diff --git a/3rdparty/libjpeg/CMakeLists.txt b/3rdparty/libjpeg/CMakeLists.txt
index 708e63e25b..e8b6528765 100644
--- a/3rdparty/libjpeg/CMakeLists.txt
+++ b/3rdparty/libjpeg/CMakeLists.txt
@@ -9,6 +9,12 @@ ocv_include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 file(GLOB lib_srcs *.c)
 file(GLOB lib_hdrs *.h)
 
+if(ANDROID OR IOS)
+  ocv_list_filterout(lib_srcs jmemansi.c)
+else()
+  ocv_list_filterout(lib_srcs jmemnobs.c)
+endif()
+
 # ----------------------------------------------------------------------------------
 #         Define the library target:
 # ----------------------------------------------------------------------------------
@@ -25,7 +31,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   set_source_files_properties(jcdctmgr.c PROPERTIES COMPILE_FLAGS "-O1")
 endif()
 
-ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wshadow -Wunused)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wshadow -Wunused -Wmain -Wimplicit-function-declaration)
+ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4244 /wd4267) # vs2005
 
 set_target_properties(${JPEG_LIBRARY}
   PROPERTIES OUTPUT_NAME ${JPEG_LIBRARY}
diff --git a/3rdparty/libjpeg/README b/3rdparty/libjpeg/README
index 86cc20669d..0a23c19590 100644
--- a/3rdparty/libjpeg/README
+++ b/3rdparty/libjpeg/README
@@ -1,24 +1,20 @@
 The Independent JPEG Group's JPEG software
 ==========================================
 
-README for release 6b of 27-Mar-1998
+README for release 8d of 15-Jan-2012
 ====================================
 
-This distribution contains the sixth public release of the Independent JPEG
+This distribution contains the eighth public release of the Independent JPEG
 Group's free JPEG software.  You are welcome to redistribute this software and
 to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.
 
-Serious users of this software (particularly those incorporating it into
-larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
-our electronic mailing list.  Mailing list members are notified of updates
-and have a chance to participate in technical discussions, etc.
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.
 
-This software is the work of Tom Lane, Philip Gladstone, Jim Boucher,
-Lee Crocker, Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi,
-Guido Vollbeding, Ge' Weijers, and other members of the Independent JPEG
-Group.
-
-IJG is not affiliated with the official ISO JPEG standards committee.
+IJG is not affiliated with the ISO/IEC JTC1/SC29/WG1 standards committee
+(also known as JPEG, together with ITU-T SG16).
 
 
 DOCUMENTATION ROADMAP
@@ -30,27 +26,27 @@ OVERVIEW            General description of JPEG and the IJG software.
 LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
 REFERENCES          Where to learn more about JPEG.
 ARCHIVE LOCATIONS   Where to find newer versions of this software.
-RELATED SOFTWARE    Other stuff you should get.
+ACKNOWLEDGMENTS     Special thanks.
 FILE FORMAT WARS    Software *not* to get.
 TO DO               Plans for future IJG releases.
 
 Other documentation files in the distribution are:
 
 User documentation:
-  install.doc       How to configure and install the IJG software.
-  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+  install.txt       How to configure and install the IJG software.
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                     rdjpgcom, and wrjpgcom.
-  *.1               Unix-style man pages for programs (same info as usage.doc).
-  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
   change.log        Version-to-version change highlights.
 Programmer and internal documentation:
-  libjpeg.doc       How to use the JPEG library in your own programs.
+  libjpeg.txt       How to use the JPEG library in your own programs.
   example.c         Sample code for calling the JPEG library.
-  structure.doc     Overview of the JPEG library's internal structure.
-  filelist.doc      Road map of IJG files.
-  coderules.doc     Coding style rules --- please read if you contribute code.
+  structure.txt     Overview of the JPEG library's internal structure.
+  filelist.txt      Road map of IJG files.
+  coderules.txt     Coding style rules --- please read if you contribute code.
 
-Please read at least the files install.doc and usage.doc.  Useful information
+Please read at least the files install.txt and usage.txt.  Some information
 can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
 ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
 
@@ -62,24 +58,15 @@ the order listed) before diving into the code.
 OVERVIEW
 ========
 
-This package contains C software to implement JPEG image compression and
-decompression.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG is intended for compressing
-"real-world" scenes; line drawings, cartoons and other non-realistic images
-are not its strong suit.  JPEG is lossy, meaning that the output image is not
-exactly identical to the input image.  Hence you must not use JPEG if you
-have to have identical output bits.  However, on typical photographic images,
-very good compression levels can be obtained with no visible change, and
-remarkably high compression levels are possible if you can tolerate a
-low-quality image.  For more details, see the references, or just experiment
-with various compression settings.
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.
 
 This software implements JPEG baseline, extended-sequential, and progressive
 compression processes.  Provision is made for supporting all variants of these
 processes, although some uncommon parameter settings aren't implemented yet.
-For legal reasons, we are not distributing code for the arithmetic-coding
-variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
-the hierarchical or lossless processes defined in the standard.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.
 
 We provide a set of library routines for reading and writing JPEG image files,
 plus two sample applications "cjpeg" and "djpeg", which use the library to
@@ -91,10 +78,11 @@ considerable functionality beyond the bare JPEG coding/decoding capability;
 for example, the color quantization modules are not strictly part of JPEG
 decoding, but they are essential for output to colormapped file formats or
 colormapped displays.  These extra functions can be compiled out of the
-library if not required for a particular application.  We have also included
-"jpegtran", a utility for lossless transcoding between different JPEG
-processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
-inserting and extracting textual comments in JFIF files.
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.
 
 The emphasis in designing this software has been on achieving portability and
 flexibility, while also making it fast enough to be useful.  In particular,
@@ -127,7 +115,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -170,17 +158,8 @@ the foregoing paragraphs do.
 The Unix configuration script "configure" was produced with GNU Autoconf.
 It is copyright by the Free Software Foundation but is freely distributable.
 The same holds for its supporting scripts (config.guess, config.sub,
-ltconfig, ltmain.sh).  Another support script, install-sh, is copyright
-by M.I.T. but is also freely distributable.
-
-It appears that the arithmetic coding option of the JPEG spec is covered by
-patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
-legally be used without obtaining one or more licenses.  For this reason,
-support for arithmetic coding has been removed from the free JPEG software.
-(Since arithmetic coding provides only a marginal gain over the unpatented
-Huffman mode, it is unlikely that very many implementations will support it.)
-So far as we are aware, there are no patent restrictions on the remaining
-code.
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.
 
 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent, GIF reading support has
@@ -198,7 +177,7 @@ We are required to state that
 REFERENCES
 ==========
 
-We highly recommend reading one or more of these references before trying to
+We recommend reading one or more of these references before trying to
 understand the innards of the JPEG software.
 
 The best short technical introduction to the JPEG compression algorithm is
@@ -207,7 +186,7 @@ The best short technical introduction to the JPEG compression algorithm is
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PostScript file containing a revised version of Wallace's article is
-available at ftp://ftp.uu.net/graphics/jpeg/wallace.ps.gz.  The file (actually
+available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -222,82 +201,67 @@ code but don't know much about data compression in general.  The book's JPEG
 sample code is far from industrial-strength, but when you are ready to look
 at a full implementation, you've got one here...
 
-The best full description of JPEG is the textbook "JPEG Still Image Data
-Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
-by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
-The book includes the complete text of the ISO JPEG standards (DIS 10918-1
-and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
-in existence, and we highly recommend it.
-
-The JPEG standard itself is not available electronically; you must order a
-paper copy through ISO or ITU.  (Unless you feel a need to own a certified
-official copy, we recommend buying the Pennebaker and Mitchell book instead;
-it's much cheaper and includes a great deal of useful explanatory material.)
-In the USA, copies of the standard may be ordered from ANSI Sales at (212)
-642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
-doesn't take credit card orders, but Global does.)  It's not cheap: as of
-1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
-shipping/handling.  The standard is divided into two parts, Part 1 being the
-actual specification, while Part 2 covers compliance testing methods.  Part 1
-is titled "Digital Compression and Coding of Continuous-tone Still Images,
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+Although this is by far the most detailed and comprehensive exposition of
+JPEG publicly available, we point out that it is still missing an explanation
+of the most essential properties and algorithms of the underlying DCT
+technology.
+If you think that you know about DCT-based JPEG after reading this book,
+then you are in delusion.  The real fundamentals and corresponding potential
+of DCT-based JPEG are not publicly known so far, and that is the reason for
+all the mistaken developments taking place in the image coding domain.
+
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
 Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
 10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
 Continuous-tone Still Images, Part 2: Compliance testing" and has document
 numbers ISO/IEC IS 10918-2, ITU-T T.83.
-
-Some extensions to the original JPEG standard are defined in JPEG Part 3,
-a newer ISO standard numbered ISO/IEC IS 10918-3 and ITU-T T.84.  IJG
-currently does not support any Part 3 extensions.
+IJG JPEG 8 introduces an implementation of the JPEG SmartScale extension
+which is specified in two documents:  A contributed document at ITU and ISO
+with title "ITU-T JPEG-Plus Proposal for Extending ITU-T T.81 for Advanced
+Image Coding", April 2006, Geneva, Switzerland.  The latest version of this
+document is Revision 3.  And a contributed document ISO/IEC JTC1/SC29/WG1 N
+5799 with title "Evolution of JPEG", June/July 2011, Berlin, Germany.
 
 The JPEG standard does not specify all details of an interchangeable file
 format.  For the omitted details we follow the "JFIF" conventions, revision
-1.02.  A copy of the JFIF spec is available from:
-	Literature Department
-	C-Cube Microsystems, Inc.
-	1778 McCarthy Blvd.
-	Milpitas, CA 95035
-	phone (408) 944-6300,  fax (408) 944-6314
-A PostScript version of this document is available by FTP at
-ftp://ftp.uu.net/graphics/jpeg/jfif.ps.gz.  There is also a plain text
-version at ftp://ftp.uu.net/graphics/jpeg/jfif.txt.gz, but it is missing
-the figures.
+1.02.  JFIF 1.02 has been adopted as an Ecma International Technical Report
+and thus received a formal publication status.  It is available as a free
+download in PDF format from
+http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
+A PostScript version of the JFIF document is available at
+http://www.ijg.org/files/jfif.ps.gz.  There is also a plain text version at
+http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.
 
 The TIFF 6.0 file format specification can be obtained by FTP from
 ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation scheme
 found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
 IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
 Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7).  Copies of this Note can be obtained from ftp.sgi.com or
-from ftp://ftp.uu.net/graphics/jpeg/.  It is expected that the next revision
+(Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
 of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
 Although IJG's own code does not support TIFF/JPEG, the free libtiff library
-uses our library to implement TIFF/JPEG per the Note.  libtiff is available
-from ftp://ftp.sgi.com/graphics/tiff/.
+uses our library to implement TIFF/JPEG per the Note.
 
 
 ARCHIVE LOCATIONS
 =================
 
-The "official" archive site for this software is ftp.uu.net (Internet
-address 192.48.96.9).  The most recent released version can always be found
-there in directory graphics/jpeg.  This particular version will be archived
-as ftp://ftp.uu.net/graphics/jpeg/jpegsrc.v6b.tar.gz.  If you don't have
-direct Internet access, UUNET's archives are also available via UUCP; contact
-help@uunet.uu.net for information on retrieving files that way.
-
-Numerous Internet sites maintain copies of the UUNET files.  However, only
-ftp.uu.net is guaranteed to have the latest official version.
-
-You can also obtain this software in DOS-compatible "zip" archive format from
-the SimTel archives (ftp://ftp.simtel.net/pub/simtelnet/msdos/graphics/), or
-on CompuServe in the Graphics Support forum (GO CIS:GRAPHSUP), library 12
-"JPEG Tools".  Again, these versions may sometimes lag behind the ftp.uu.net
-release.
-
-The JPEG FAQ (Frequently Asked Questions) article is a useful source of
-general information about JPEG.  It is updated constantly and therefore is
-not included in this distribution.  The FAQ is posted every two weeks to
-Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".  This particular version will be archived as
+http://www.ijg.org/files/jpegsrc.v8d.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr8d.zip.
+
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.
 It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
 and other news.answers archive sites, including the official news.answers
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
@@ -307,79 +271,81 @@ with body
 	send usenet/news.answers/jpeg-faq/part2
 
 
-RELATED SOFTWARE
-================
+ACKNOWLEDGMENTS
+===============
+
+Thank to Juergen Bruder for providing me with a copy of the common DCT
+algorithm article, only to find out that I had come to the same result
+in a more direct and comprehensible way with a more generative approach.
+
+Thank to Istvan Sebestyen and Joan L. Mitchell for inviting me to the
+ITU JPEG (Study Group 16) meeting in Geneva, Switzerland.
 
-Numerous viewing and image manipulation programs now support JPEG.  (Quite a
-few of them use this library to do so.)  The JPEG FAQ described above lists
-some of the more popular free and shareware viewers, and tells where to
-obtain them on Internet.
+Thank to Thomas Wiegand and Gary Sullivan for inviting me to the
+Joint Video Team (MPEG & ITU) meeting in Geneva, Switzerland.
 
-If you are on a Unix machine, we highly recommend Jef Poskanzer's free
-PBMPLUS software, which provides many useful operations on PPM-format image
-files.  In particular, it can convert PPM images to and from a wide range of
-other formats, thus making cjpeg/djpeg considerably more useful.  The latest
-version is distributed by the NetPBM group, and is available from numerous
-sites, notably ftp://wuarchive.wustl.edu/graphics/graphics/packages/NetPBM/.
-Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software is;
-you are likely to have difficulty making it work on any non-Unix machine.
+Thank to Thomas Richter and Daniel Lee for inviting me to the
+ISO/IEC JTC1/SC29/WG1 (also known as JPEG, together with ITU-T SG16)
+meeting in Berlin, Germany.
 
-A different free JPEG implementation, written by the PVRG group at Stanford,
-is available from ftp://havefun.stanford.edu/pub/jpeg/.  This program
-is designed for research and experimentation rather than production use;
-it is slower, harder to use, and less portable than the IJG code, but it
-is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
-which we do not.  (On the other hand, it doesn't do progressive JPEG.)
+Thank to John Korejwa and Massimo Ballerini for inviting me to
+fruitful consultations in Boston, MA and Milan, Italy.
+
+Thank to Hendrik Elstner, Roland Fassauer, Simone Zuck, Guenther
+Maier-Gerber, Walter Stoeber, Fred Schmitz, and Norbert Braunagel
+for corresponding business development.
+
+Thank to Nico Zschach and Dirk Stelling of the technical support team
+at the Digital Images company in Halle for providing me with extra
+equipment for configuration tests.
+
+Thank to Richard F. Lyon (then of Foveon Inc.) for fruitful
+communication about JPEG configuration in Sigma Photo Pro software.
+
+Thank to Andrew Finkenstadt for hosting the ijg.org site.
+
+Last but not least special thank to Thomas G. Lane for the original
+design and development of this singular software package.
 
 
 FILE FORMAT WARS
 ================
 
-Some JPEG programs produce files that are not compatible with our library.
-The root of the problem is that the ISO JPEG committee failed to specify a
-concrete file format.  Some vendors "filled in the blanks" on their own,
-creating proprietary formats that no one else could read.  (For example, none
-of the early commercial JPEG implementations for the Macintosh were able to
-exchange compressed files.)
-
-The file format we have adopted is called JFIF (see REFERENCES).  This format
-has been agreed to by a number of major commercial JPEG vendors, and it has
-become the de facto standard.  JFIF is a minimal or "low end" representation.
-We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
-Technical Note #2) for "high end" applications that need to record a lot of
-additional data about an image.  TIFF/JPEG is fairly new and not yet widely
-supported, unfortunately.
-
-The upcoming JPEG Part 3 standard defines a file format called SPIFF.
-SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
-be able to read the most common variant of SPIFF.  SPIFF has some technical
-advantages over JFIF, but its major claim to fame is simply that it is an
-official standard rather than an informal one.  At this point it is unclear
-whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
-standard.  IJG intends to support SPIFF once the standard is frozen, but we
-have not decided whether it should become our default output format or not.
-(In any case, our decoder will remain capable of reading JFIF indefinitely.)
-
-Various proprietary file formats incorporating JPEG compression also exist.
-We have little or no sympathy for the existence of these formats.  Indeed,
-one of the original reasons for developing this free software was to help
-force convergence on common, open format standards for JPEG files.  Don't
-use a proprietary file format!
+The ISO/IEC JTC1/SC29/WG1 standards committee (also known as JPEG, together
+with ITU-T SG16) currently promotes different formats containing the name
+"JPEG" which is misleading because these formats are incompatible with
+original DCT-based JPEG and are based on faulty technologies.
+IJG therefore does not and will not support such momentary mistakes
+(see REFERENCES).
+There exist also distributions under the name "OpenJPEG" promoting such
+kind of formats which is misleading because they don't support original
+JPEG images.
+We have no sympathy for the promotion of inferior formats.  Indeed, one of
+the original reasons for developing this free software was to help force
+convergence on common, interoperable format standards for JPEG files.
+Don't use an incompatible file format!
+(In any case, our decoder will remain capable of reading existing JPEG
+image files indefinitely.)
+
+Furthermore, the ISO committee pretends to be "responsible for the popular
+JPEG" in their public reports which is not true because they don't respond to
+actual requirements for the maintenance of the original JPEG specification.
+
+There are currently distributions in circulation containing the name
+"libjpeg" which claim to be a "derivative" or "fork" of the original
+libjpeg, but don't have the features and are incompatible with formats
+supported by actual IJG libjpeg distributions.  Furthermore, they
+violate the license conditions as described under LEGAL ISSUES above.
+We have no sympathy for the release of misleading and illegal
+distributions derived from obsolete code bases.
+Don't use an obsolete code base!
 
 
 TO DO
 =====
 
-The major thrust for v7 will probably be improvement of visual quality.
-The current method for scaling the quantization tables is known not to be
-very good at low Q values.  We also intend to investigate block boundary
-smoothing, "poor man's variable quantization", and other means of improving
-quality-vs-file-size performance without sacrificing compatibility.
-
-In future versions, we are considering supporting some of the upcoming JPEG
-Part 3 extensions --- principally, variable quantization and the SPIFF file
-format.
-
-As always, speeding things up is of great interest.
+Version 8 is the first release of a new generation JPEG standard
+to overcome the limitations of the original JPEG specification.
+More features are being prepared for coming releases...
 
-Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
+Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org.
diff --git a/3rdparty/libjpeg/change.log b/3rdparty/libjpeg/change.log
new file mode 100644
index 0000000000..729bca6cfa
--- /dev/null
+++ b/3rdparty/libjpeg/change.log
@@ -0,0 +1,346 @@
+CHANGE LOG for Independent JPEG Group's JPEG software
+
+
+Version 8d  15-Jan-2012
+-----------------------
+
+Add cjpeg -rgb option to create RGB JPEG files.
+Using this switch suppresses the conversion from RGB
+colorspace input to the default YCbCr JPEG colorspace.
+This feature allows true lossless JPEG coding of RGB color images.
+The recommended command for this purpose is currently
+cjpeg -rgb -block 1 -arithmetic.
+SmartScale capable decoder (introduced with IJG JPEG 8) required.
+Thank to Michael Koch for the initial suggestion.
+
+Add option to disable the region adjustment in the transupp crop code.
+Thank to Jeffrey Friedl for the suggestion.
+
+Thank to Richard Jones and Edd Dawson for various minor corrections.
+
+Thank to Akim Demaille for configure.ac cleanup.
+
+
+Version 8c  16-Jan-2011
+-----------------------
+
+Add option to compression library and cjpeg (-block N) to use
+different DCT block size.
+All N from 1 to 16 are possible.  Default is 8 (baseline format).
+Larger values produce higher compression,
+smaller values produce higher quality.
+SmartScale capable decoder (introduced with IJG JPEG 8) required.
+
+
+Version 8b  16-May-2010
+-----------------------
+
+Repair problem in new memory source manager with corrupt JPEG data.
+Thank to Ted Campbell and Samuel Chun for the report.
+
+Repair problem in Makefile.am test target.
+Thank to anonymous user for the report.
+
+Support MinGW installation with automatic configure.
+Thank to Volker Grabsch for the suggestion.
+
+
+Version 8a  28-Feb-2010
+-----------------------
+
+Writing tables-only datastreams via jpeg_write_tables works again.
+
+Support 32-bit BMPs (RGB image with Alpha channel) for read in cjpeg.
+Thank to Brett Blackham for the suggestion.
+
+Improve accuracy in floating point IDCT calculation.
+Thank to Robert Hooke for the hint.
+
+
+Version 8  10-Jan-2010
+----------------------
+
+jpegtran now supports the same -scale option as djpeg for "lossless" resize.
+An implementation of the JPEG SmartScale extension is required for this
+feature.  A (draft) specification of the JPEG SmartScale extension is
+available as a contributed document at ITU and ISO.  Revision 2 or later
+of the document is required (latest document version is Revision 3).
+The SmartScale extension will enable more features beside lossless resize
+in future implementations, as described in the document (new compression
+options).
+
+Add sanity check in BMP reader module to avoid cjpeg crash for empty input
+image (thank to Isaev Ildar of ISP RAS, Moscow, RU for reporting this error).
+
+Add data source and destination managers for read from and write to
+memory buffers.  New API functions jpeg_mem_src and jpeg_mem_dest.
+Thank to Roberto Boni from Italy for the suggestion.
+
+
+Version 7  27-Jun-2009
+----------------------
+
+New scaled DCTs implemented.
+djpeg now supports scalings N/8 with all N from 1 to 16.
+cjpeg now supports scalings 8/N with all N from 1 to 16.
+Scaled DCTs with size larger than 8 are now also used for resolving the
+common 2x2 chroma subsampling case without additional spatial resampling.
+Separate spatial resampling for those kind of files is now only necessary
+for N>8 scaling cases.
+Furthermore, separate scaled DCT functions are provided for direct resolving
+of the common asymmetric subsampling cases (2x1 and 1x2) without additional
+spatial resampling.
+
+cjpeg -quality option has been extended for support of separate quality
+settings for luminance and chrominance (or in general, for every provided
+quantization table slot).
+New API function jpeg_default_qtables() and q_scale_factor array in library.
+
+Added -nosmooth option to cjpeg, complementary to djpeg.
+New variable "do_fancy_downsampling" in library, complement to fancy
+upsampling.  Fancy upsampling now uses direct DCT scaling with sizes
+larger than 8.  The old method is not reversible and has been removed.
+
+Support arithmetic entropy encoding and decoding.
+Added files jaricom.c, jcarith.c, jdarith.c.
+
+Straighten the file structure:
+Removed files jidctred.c, jcphuff.c, jchuff.h, jdphuff.c, jdhuff.h.
+
+jpegtran has a new "lossless" cropping feature.
+
+Implement -perfect option in jpegtran, new API function
+jtransform_perfect_transform() in transupp. (DP 204_perfect.dpatch)
+
+Better error messages for jpegtran fopen failure.
+(DP 203_jpegtran_errmsg.dpatch)
+
+Fix byte order issue with 16bit PPM/PGM files in rdppm.c/wrppm.c:
+according to Netpbm, the de facto standard implementation of the PNM formats,
+the most significant byte is first. (DP 203_rdppm.dpatch)
+
+Add -raw option to rdjpgcom not to mangle the output.
+(DP 205_rdjpgcom_raw.dpatch)
+
+Make rdjpgcom locale aware. (DP 201_rdjpgcom_locale.dpatch)
+
+Add extern "C" to jpeglib.h.
+This avoids the need to put extern "C" { ... } around #include "jpeglib.h"
+in your C++ application.  Defining the symbol DONT_USE_EXTERN_C in the
+configuration prevents this. (DP 202_jpeglib.h_c++.dpatch)
+
+
+Version 6b  27-Mar-1998
+-----------------------
+
+jpegtran has new features for lossless image transformations (rotation
+and flipping) as well as "lossless" reduction to grayscale.
+
+jpegtran now copies comments by default; it has a -copy switch to enable
+copying all APPn blocks as well, or to suppress comments.  (Formerly it
+always suppressed comments and APPn blocks.)  jpegtran now also preserves
+JFIF version and resolution information.
+
+New decompressor library feature: COM and APPn markers found in the input
+file can be saved in memory for later use by the application.  (Before,
+you had to code this up yourself with a custom marker processor.)
+
+There is an unused field "void * client_data" now in compress and decompress
+parameter structs; this may be useful in some applications.
+
+JFIF version number information is now saved by the decoder and accepted by
+the encoder.  jpegtran uses this to copy the source file's version number,
+to ensure "jpegtran -copy all" won't create bogus files that contain JFXX
+extensions but claim to be version 1.01.  Applications that generate their
+own JFXX extension markers also (finally) have a supported way to cause the
+encoder to emit JFIF version number 1.02.
+
+djpeg's trace mode reports JFIF 1.02 thumbnail images as such, rather
+than as unknown APP0 markers.
+
+In -verbose mode, djpeg and rdjpgcom will try to print the contents of
+APP12 markers as text.  Some digital cameras store useful text information
+in APP12 markers.
+
+Handling of truncated data streams is more robust: blocks beyond the one in
+which the error occurs will be output as uniform gray, or left unchanged
+if decoding a progressive JPEG.  The appearance no longer depends on the
+Huffman tables being used.
+
+Huffman tables are checked for validity much more carefully than before.
+
+To avoid the Unisys LZW patent, djpeg's GIF output capability has been
+changed to produce "uncompressed GIFs", and cjpeg's GIF input capability
+has been removed altogether.  We're not happy about it either, but there
+seems to be no good alternative.
+
+The configure script now supports building libjpeg as a shared library
+on many flavors of Unix (all the ones that GNU libtool knows how to
+build shared libraries for).  Use "./configure --enable-shared" to
+try this out.
+
+New jconfig file and makefiles for Microsoft Visual C++ and Developer Studio.
+Also, a jconfig file and a build script for Metrowerks CodeWarrior
+on Apple Macintosh.  makefile.dj has been updated for DJGPP v2, and there
+are miscellaneous other minor improvements in the makefiles.
+
+jmemmac.c now knows how to create temporary files following Mac System 7
+conventions.
+
+djpeg's -map switch is now able to read raw-format PPM files reliably.
+
+cjpeg -progressive -restart no longer generates any unnecessary DRI markers.
+
+Multiple calls to jpeg_simple_progression for a single JPEG object
+no longer leak memory.
+
+
+Version 6a  7-Feb-96
+--------------------
+
+Library initialization sequence modified to detect version mismatches
+and struct field packing mismatches between library and calling application.
+This change requires applications to be recompiled, but does not require
+any application source code change.
+
+All routine declarations changed to the style "GLOBAL(type) name ...",
+that is, GLOBAL, LOCAL, METHODDEF, EXTERN are now macros taking the
+routine's return type as an argument.  This makes it possible to add
+Microsoft-style linkage keywords to all the routines by changing just
+these macros.  Note that any application code that was using these macros
+will have to be changed.
+
+DCT coefficient quantization tables are now stored in normal array order
+rather than zigzag order.  Application code that calls jpeg_add_quant_table,
+or otherwise manipulates quantization tables directly, will need to be
+changed.  If you need to make such code work with either older or newer
+versions of the library, a test like "#if JPEG_LIB_VERSION >= 61" is
+recommended.
+
+djpeg's trace capability now dumps DQT tables in natural order, not zigzag
+order.  This allows the trace output to be made into a "-qtables" file
+more easily.
+
+New system-dependent memory manager module for use on Apple Macintosh.
+
+Fix bug in cjpeg's -smooth option: last one or two scanlines would be
+duplicates of the prior line unless the image height mod 16 was 1 or 2.
+
+Repair minor problems in VMS, BCC, MC6 makefiles.
+
+New configure script based on latest GNU Autoconf.
+
+Correct the list of include files needed by MetroWerks C for ccommand().
+
+Numerous small documentation updates.
+
+
+Version 6  2-Aug-95
+-------------------
+
+Progressive JPEG support: library can read and write full progressive JPEG
+files.  A "buffered image" mode supports incremental decoding for on-the-fly
+display of progressive images.  Simply recompiling an existing IJG-v5-based
+decoder with v6 should allow it to read progressive files, though of course
+without any special progressive display.
+
+New "jpegtran" application performs lossless transcoding between different
+JPEG formats; primarily, it can be used to convert baseline to progressive
+JPEG and vice versa.  In support of jpegtran, the library now allows lossless
+reading and writing of JPEG files as DCT coefficient arrays.  This ability
+may be of use in other applications.
+
+Notes for programmers:
+* We changed jpeg_start_decompress() to be able to suspend; this makes all
+decoding modes available to suspending-input applications.  However,
+existing applications that use suspending input will need to be changed
+to check the return value from jpeg_start_decompress().  You don't need to
+do anything if you don't use a suspending data source.
+* We changed the interface to the virtual array routines: access_virt_array
+routines now take a count of the number of rows to access this time.  The
+last parameter to request_virt_array routines is now interpreted as the
+maximum number of rows that may be accessed at once, but not necessarily
+the height of every access.
+
+
+Version 5b  15-Mar-95
+---------------------
+
+Correct bugs with grayscale images having v_samp_factor > 1.
+
+jpeg_write_raw_data() now supports output suspension.
+
+Correct bugs in "configure" script for case of compiling in
+a directory other than the one containing the source files.
+
+Repair bug in jquant1.c: sometimes didn't use as many colors as it could.
+
+Borland C makefile and jconfig file work under either MS-DOS or OS/2.
+
+Miscellaneous improvements to documentation.
+
+
+Version 5a  7-Dec-94
+--------------------
+
+Changed color conversion roundoff behavior so that grayscale values are
+represented exactly.  (This causes test image files to change.)
+
+Make ordered dither use 16x16 instead of 4x4 pattern for a small quality
+improvement.
+
+New configure script based on latest GNU Autoconf.
+Fix configure script to handle CFLAGS correctly.
+Rename *.auto files to *.cfg, so that configure script still works if
+file names have been truncated for DOS.
+
+Fix bug in rdbmp.c: didn't allow for extra data between header and image.
+
+Modify rdppm.c/wrppm.c to handle 2-byte raw PPM/PGM formats for 12-bit data.
+
+Fix several bugs in rdrle.c.
+
+NEED_SHORT_EXTERNAL_NAMES option was broken.
+
+Revise jerror.h/jerror.c for more flexibility in message table.
+
+Repair oversight in jmemname.c NO_MKTEMP case: file could be there
+but unreadable.
+
+
+Version 5  24-Sep-94
+--------------------
+
+Version 5 represents a nearly complete redesign and rewrite of the IJG
+software.  Major user-visible changes include:
+  * Automatic configuration simplifies installation for most Unix systems.
+  * A range of speed vs. image quality tradeoffs are supported.
+    This includes resizing of an image during decompression: scaling down
+    by a factor of 1/2, 1/4, or 1/8 is handled very efficiently.
+  * New programs rdjpgcom and wrjpgcom allow insertion and extraction
+    of text comments in a JPEG file.
+
+The application programmer's interface to the library has changed completely.
+Notable improvements include:
+  * We have eliminated the use of callback routines for handling the
+    uncompressed image data.  The application now sees the library as a
+    set of routines that it calls to read or write image data on a
+    scanline-by-scanline basis.
+  * The application image data is represented in a conventional interleaved-
+    pixel format, rather than as a separate array for each color channel.
+    This can save a copying step in many programs.
+  * The handling of compressed data has been cleaned up: the application can
+    supply routines to source or sink the compressed data.  It is possible to
+    suspend processing on source/sink buffer overrun, although this is not
+    supported in all operating modes.
+  * All static state has been eliminated from the library, so that multiple
+    instances of compression or decompression can be active concurrently.
+  * JPEG abbreviated datastream formats are supported, ie, quantization and
+    Huffman tables can be stored separately from the image data.
+  * And not only that, but the documentation of the library has improved
+    considerably!
+
+
+The last widely used release before the version 5 rewrite was version 4A of
+18-Feb-93.  Change logs before that point have been discarded, since they
+are not of much interest after the rewrite.
diff --git a/3rdparty/libjpeg/jaricom.c b/3rdparty/libjpeg/jaricom.c
new file mode 100644
index 0000000000..690068861f
--- /dev/null
+++ b/3rdparty/libjpeg/jaricom.c
@@ -0,0 +1,153 @@
+/*
+ * jaricom.c
+ *
+ * Developed 1997-2011 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.3 in the JPEG spec (D.2 in the draft),
+ * ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81, and Table 24
+ * in the JBIG spec, ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* The following #define specifies the packing of the four components
+ * into the compact INT32 representation.
+ * Note that this formula must match the actual arithmetic encoder
+ * and decoder implementation.  The implementation has to be changed
+ * if this formula is changed.
+ * The current organization is leaned on Markus Kuhn's JBIG
+ * implementation (jbig_tab.c).
+ */
+
+#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+
+const INT32 jpeg_aritab[113+1] = {
+/*
+ * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+  V(   0, 0x5a1d,   1,   1, 1 ),
+  V(   1, 0x2586,  14,   2, 0 ),
+  V(   2, 0x1114,  16,   3, 0 ),
+  V(   3, 0x080b,  18,   4, 0 ),
+  V(   4, 0x03d8,  20,   5, 0 ),
+  V(   5, 0x01da,  23,   6, 0 ),
+  V(   6, 0x00e5,  25,   7, 0 ),
+  V(   7, 0x006f,  28,   8, 0 ),
+  V(   8, 0x0036,  30,   9, 0 ),
+  V(   9, 0x001a,  33,  10, 0 ),
+  V(  10, 0x000d,  35,  11, 0 ),
+  V(  11, 0x0006,   9,  12, 0 ),
+  V(  12, 0x0003,  10,  13, 0 ),
+  V(  13, 0x0001,  12,  13, 0 ),
+  V(  14, 0x5a7f,  15,  15, 1 ),
+  V(  15, 0x3f25,  36,  16, 0 ),
+  V(  16, 0x2cf2,  38,  17, 0 ),
+  V(  17, 0x207c,  39,  18, 0 ),
+  V(  18, 0x17b9,  40,  19, 0 ),
+  V(  19, 0x1182,  42,  20, 0 ),
+  V(  20, 0x0cef,  43,  21, 0 ),
+  V(  21, 0x09a1,  45,  22, 0 ),
+  V(  22, 0x072f,  46,  23, 0 ),
+  V(  23, 0x055c,  48,  24, 0 ),
+  V(  24, 0x0406,  49,  25, 0 ),
+  V(  25, 0x0303,  51,  26, 0 ),
+  V(  26, 0x0240,  52,  27, 0 ),
+  V(  27, 0x01b1,  54,  28, 0 ),
+  V(  28, 0x0144,  56,  29, 0 ),
+  V(  29, 0x00f5,  57,  30, 0 ),
+  V(  30, 0x00b7,  59,  31, 0 ),
+  V(  31, 0x008a,  60,  32, 0 ),
+  V(  32, 0x0068,  62,  33, 0 ),
+  V(  33, 0x004e,  63,  34, 0 ),
+  V(  34, 0x003b,  32,  35, 0 ),
+  V(  35, 0x002c,  33,   9, 0 ),
+  V(  36, 0x5ae1,  37,  37, 1 ),
+  V(  37, 0x484c,  64,  38, 0 ),
+  V(  38, 0x3a0d,  65,  39, 0 ),
+  V(  39, 0x2ef1,  67,  40, 0 ),
+  V(  40, 0x261f,  68,  41, 0 ),
+  V(  41, 0x1f33,  69,  42, 0 ),
+  V(  42, 0x19a8,  70,  43, 0 ),
+  V(  43, 0x1518,  72,  44, 0 ),
+  V(  44, 0x1177,  73,  45, 0 ),
+  V(  45, 0x0e74,  74,  46, 0 ),
+  V(  46, 0x0bfb,  75,  47, 0 ),
+  V(  47, 0x09f8,  77,  48, 0 ),
+  V(  48, 0x0861,  78,  49, 0 ),
+  V(  49, 0x0706,  79,  50, 0 ),
+  V(  50, 0x05cd,  48,  51, 0 ),
+  V(  51, 0x04de,  50,  52, 0 ),
+  V(  52, 0x040f,  50,  53, 0 ),
+  V(  53, 0x0363,  51,  54, 0 ),
+  V(  54, 0x02d4,  52,  55, 0 ),
+  V(  55, 0x025c,  53,  56, 0 ),
+  V(  56, 0x01f8,  54,  57, 0 ),
+  V(  57, 0x01a4,  55,  58, 0 ),
+  V(  58, 0x0160,  56,  59, 0 ),
+  V(  59, 0x0125,  57,  60, 0 ),
+  V(  60, 0x00f6,  58,  61, 0 ),
+  V(  61, 0x00cb,  59,  62, 0 ),
+  V(  62, 0x00ab,  61,  63, 0 ),
+  V(  63, 0x008f,  61,  32, 0 ),
+  V(  64, 0x5b12,  65,  65, 1 ),
+  V(  65, 0x4d04,  80,  66, 0 ),
+  V(  66, 0x412c,  81,  67, 0 ),
+  V(  67, 0x37d8,  82,  68, 0 ),
+  V(  68, 0x2fe8,  83,  69, 0 ),
+  V(  69, 0x293c,  84,  70, 0 ),
+  V(  70, 0x2379,  86,  71, 0 ),
+  V(  71, 0x1edf,  87,  72, 0 ),
+  V(  72, 0x1aa9,  87,  73, 0 ),
+  V(  73, 0x174e,  72,  74, 0 ),
+  V(  74, 0x1424,  72,  75, 0 ),
+  V(  75, 0x119c,  74,  76, 0 ),
+  V(  76, 0x0f6b,  74,  77, 0 ),
+  V(  77, 0x0d51,  75,  78, 0 ),
+  V(  78, 0x0bb6,  77,  79, 0 ),
+  V(  79, 0x0a40,  77,  48, 0 ),
+  V(  80, 0x5832,  80,  81, 1 ),
+  V(  81, 0x4d1c,  88,  82, 0 ),
+  V(  82, 0x438e,  89,  83, 0 ),
+  V(  83, 0x3bdd,  90,  84, 0 ),
+  V(  84, 0x34ee,  91,  85, 0 ),
+  V(  85, 0x2eae,  92,  86, 0 ),
+  V(  86, 0x299a,  93,  87, 0 ),
+  V(  87, 0x2516,  86,  71, 0 ),
+  V(  88, 0x5570,  88,  89, 1 ),
+  V(  89, 0x4ca9,  95,  90, 0 ),
+  V(  90, 0x44d9,  96,  91, 0 ),
+  V(  91, 0x3e22,  97,  92, 0 ),
+  V(  92, 0x3824,  99,  93, 0 ),
+  V(  93, 0x32b4,  99,  94, 0 ),
+  V(  94, 0x2e17,  93,  86, 0 ),
+  V(  95, 0x56a8,  95,  96, 1 ),
+  V(  96, 0x4f46, 101,  97, 0 ),
+  V(  97, 0x47e5, 102,  98, 0 ),
+  V(  98, 0x41cf, 103,  99, 0 ),
+  V(  99, 0x3c3d, 104, 100, 0 ),
+  V( 100, 0x375e,  99,  93, 0 ),
+  V( 101, 0x5231, 105, 102, 0 ),
+  V( 102, 0x4c0f, 106, 103, 0 ),
+  V( 103, 0x4639, 107, 104, 0 ),
+  V( 104, 0x415e, 103,  99, 0 ),
+  V( 105, 0x5627, 105, 106, 1 ),
+  V( 106, 0x50e7, 108, 107, 0 ),
+  V( 107, 0x4b85, 109, 103, 0 ),
+  V( 108, 0x5597, 110, 109, 0 ),
+  V( 109, 0x504f, 111, 107, 0 ),
+  V( 110, 0x5a10, 110, 111, 1 ),
+  V( 111, 0x5522, 112, 109, 0 ),
+  V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * This last entry is used for fixed probability estimate of 0.5
+ * as suggested in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+  V( 113, 0x5a1d, 113, 113, 0 )
+};
diff --git a/3rdparty/libjpeg/jcapimin.c b/3rdparty/libjpeg/jcapimin.c
index 54fb8c58c5..639ce86f44 100644
--- a/3rdparty/libjpeg/jcapimin.c
+++ b/3rdparty/libjpeg/jcapimin.c
@@ -2,6 +2,7 @@
  * jcapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -63,14 +64,21 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
 
   cinfo->comp_info = NULL;
 
-  for (i = 0; i < NUM_QUANT_TBLS; i++)
+  for (i = 0; i < NUM_QUANT_TBLS; i++) {
     cinfo->quant_tbl_ptrs[i] = NULL;
+    cinfo->q_scale_factor[i] = 100;
+  }
 
   for (i = 0; i < NUM_HUFF_TBLS; i++) {
     cinfo->dc_huff_tbl_ptrs[i] = NULL;
     cinfo->ac_huff_tbl_ptrs[i] = NULL;
   }
 
+  /* Must do it here for emit_dqt in case jpeg_write_tables is used */
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2-1;
+
   cinfo->script_space = NULL;
 
   cinfo->input_gamma = 1.0;	/* in case application forgets */
diff --git a/3rdparty/libjpeg/jcarith.c b/3rdparty/libjpeg/jcarith.c
new file mode 100644
index 0000000000..033f67069e
--- /dev/null
+++ b/3rdparty/libjpeg/jcarith.c
@@ -0,0 +1,937 @@
+/*
+ * jcarith.c
+ *
+ * Developed 1997-2011 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding. */
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  INT32 a;               /* A register, normalized size of coding interval */
+  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
+  INT32 zc;          /* counter for pending 0x00 output values which might *
+                          * be discarded at the end ("Pacman" termination) */
+  int ct;  /* bit shift counter, determines when next byte will be written */
+  int buffer;                /* buffer for most recent output byte != 0xFF */
+
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  int next_restart_num;		/* next restart number to write (0-7) */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif
+
+
+LOCAL(void)
+emit_byte (int val, j_compress_ptr cinfo)
+/* Write next output byte; we do not support suspension in this module. */
+{
+  struct jpeg_destination_mgr * dest = cinfo->dest;
+
+  *dest->next_output_byte++ = (JOCTET) val;
+  if (--dest->free_in_buffer == 0)
+    if (! (*dest->empty_output_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  INT32 temp;
+
+  /* Section D.1.8: Termination of encoding */
+
+  /* Find the e->c in the coding interval with the largest
+   * number of trailing zero bits */
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+    e->c = temp + 0x8000L;
+  else
+    e->c = temp;
+  /* Send remaining bytes to output */
+  e->c <<= e->ct;
+  if (e->c & 0xF8000000L) {
+    /* One final overflow has to be handled */
+    if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer + 1, cinfo);
+      if (e->buffer + 1 == 0xFF)
+	emit_byte(0x00, cinfo);
+    }
+    e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+    e->sc = 0;
+  } else {
+    if (e->buffer == 0)
+      ++e->zc;
+    else if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer, cinfo);
+    }
+    if (e->sc) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      do {
+	emit_byte(0xFF, cinfo);
+	emit_byte(0x00, cinfo);
+      } while (--e->sc);
+    }
+  }
+  /* Output final bytes only if they are not 0x00 */
+  if (e->c & 0x7FFF800L) {
+    if (e->zc)  /* output final pending zero bytes */
+      do emit_byte(0x00, cinfo);
+      while (--e->zc);
+    emit_byte((e->c >> 19) & 0xFF, cinfo);
+    if (((e->c >> 19) & 0xFF) == 0xFF)
+      emit_byte(0x00, cinfo);
+    if (e->c & 0x7F800L) {
+      emit_byte((e->c >> 11) & 0xFF, cinfo);
+      if (((e->c >> 11) & 0xFF) == 0xFF)
+	emit_byte(0x00, cinfo);
+    }
+  }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant to the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) 
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv;
+
+  /* Fetch values from our compact representation of Table D.3(D.2):
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+  e->a -= qe;
+  if (val != (sv >> 7)) {
+    /* Encode the less probable symbol */
+    if (e->a >= qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency, otherwise code the LPS
+       * as usual: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+  } else {
+    /* Encode the more probable symbol */
+    if (e->a >= 0x8000L)
+      return;  /* A >= 0x8000 -> ready, no renormalization required */
+    if (e->a < qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+  }
+
+  /* Renormalization & data output per section D.1.6 */
+  do {
+    e->a <<= 1;
+    e->c <<= 1;
+    if (--e->ct == 0) {
+      /* Another byte is ready for output */
+      temp = e->c >> 19;
+      if (temp > 0xFF) {
+	/* Handle overflow over all stacked 0xFF bytes */
+	if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer + 1, cinfo);
+	  if (e->buffer + 1 == 0xFF)
+	    emit_byte(0x00, cinfo);
+	}
+	e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+	e->sc = 0;
+	/* Note: The 3 spacer bits in the C register guarantee
+	 * that the new buffer byte can't be 0xFF here
+	 * (see page 160 in the P&M JPEG book). */
+	e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+      } else if (temp == 0xFF) {
+	++e->sc;  /* stack 0xFF byte (which might overflow later) */
+      } else {
+	/* Output all stacked 0xFF bytes, they will not overflow any more */
+	if (e->buffer == 0)
+	  ++e->zc;
+	else if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer, cinfo);
+	}
+	if (e->sc) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  do {
+	    emit_byte(0xFF, cinfo);
+	    emit_byte(0x00, cinfo);
+	  } while (--e->sc);
+	}
+	e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+      }
+      e->c &= 0x7FFFFL;
+      e->ct += 8;
+    }
+  } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(void)
+emit_restart (j_compress_ptr cinfo, int restart_num)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  finish_pass(cinfo);
+
+  emit_byte(0xFF, cinfo);
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl;
+  int v, v2, m;
+  ISHIFT_TEMPS
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = m - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = m;
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke;
+  int v, v2, m;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Figure F.5: Encode_AC_Coefficients */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 0);		/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 0);
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 1);
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+    }
+    st += 2;
+    /* Figure F.8: Encoding the magnitude category of v */
+    m = 0;
+    if (v -= 1) {
+      arith_encode(cinfo, st, 1);
+      m = 1;
+      v2 = v;
+      if (v2 >>= 1) {
+	arith_encode(cinfo, st, 1);
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+    }
+    arith_encode(cinfo, st, 0);
+    /* Figure F.9: Encoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      arith_encode(cinfo, st, (m & v) ? 1 : 0);
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int Al, blkn;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  Al = cinfo->Al;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    arith_encode(cinfo, st, (MCU_data[blkn][0][0] >> Al) & 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke, kex;
+  int v;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Section G.1.3.3: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = ke; kex > 0; kex--)
+    if ((v = (*block)[natural_order[kex]]) >= 0) {
+      if (v >>= cinfo->Ah) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Ah) break;
+    }
+
+  /* Figure G.10: Encode_AC_Coefficients_SA */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 0);
+	  }
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 1);
+	  }
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+    }
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, k, ke;
+  int v, v2, m;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = (*block)[0];
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+
+    /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+    if ((ke = cinfo->lim_Se) == 0) continue;
+    tbl = compptr->ac_tbl_no;
+
+    /* Establish EOB (end-of-block) index */
+    do {
+      if ((*block)[natural_order[ke]]) break;
+    } while (--ke);
+
+    /* Figure F.5: Encode_AC_Coefficients */
+    for (k = 0; k < ke;) {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+      while ((v = (*block)[natural_order[++k]]) == 0) {
+	arith_encode(cinfo, st + 1, 0);
+	st += 3;
+      }
+      arith_encode(cinfo, st + 1, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, entropy->fixed_bin, 0);
+      } else {
+	v = -v;
+	arith_encode(cinfo, entropy->fixed_bin, 1);
+      }
+      st += 2;
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	if (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (v2 >>= 1) {
+	    arith_encode(cinfo, st, 1);
+	    m <<= 1;
+	    st += 1;
+	  }
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+    /* Encode EOB decision only if k < cinfo->lim_Se */
+    if (k < cinfo->lim_Se) {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      arith_encode(cinfo, st, 1);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    /* Make sure to avoid that in the master control logic!
+     * We are fully adaptive here and need no extra
+     * statistics gathering pass!
+     */
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+
+  /* We assume jcmaster.c already validated the progressive scan parameters. */
+
+  /* Select execution routines */
+  if (cinfo->progressive_mode) {
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+    }
+  } else
+    entropy->pub.encode_mcu = encode_mcu;
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      if (cinfo->progressive_mode)
+	/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+	cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Initialize arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+  entropy->pub.finish_pass = finish_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+}
diff --git a/3rdparty/libjpeg/jccoefct.c b/3rdparty/libjpeg/jccoefct.c
index 1963ddb61b..924a703dda 100644
--- a/3rdparty/libjpeg/jccoefct.c
+++ b/3rdparty/libjpeg/jccoefct.c
@@ -2,6 +2,7 @@
  * jccoefct.c
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2003-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -149,6 +150,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   int blkn, bi, ci, yindex, yoffset, blockcnt;
   JDIMENSION ypos, xpos;
   jpeg_component_info *compptr;
+  forward_DCT_ptr forward_DCT;
 
   /* Loop to write as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
@@ -167,35 +169,37 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       blkn = 0;
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
+	forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index];
 	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						: compptr->last_col_width;
 	xpos = MCU_col_num * compptr->MCU_sample_width;
-	ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+	ypos = yoffset * compptr->DCT_v_scaled_size;
+	/* ypos == (yoffset+yindex) * DCTSIZE */
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (coef->iMCU_row_num < last_iMCU_row ||
 	      yoffset+yindex < compptr->last_row_height) {
-	    (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-					 input_buf[compptr->component_index],
-					 coef->MCU_buffer[blkn],
-					 ypos, xpos, (JDIMENSION) blockcnt);
+	    (*forward_DCT) (cinfo, compptr,
+			    input_buf[compptr->component_index],
+			    coef->MCU_buffer[blkn],
+			    ypos, xpos, (JDIMENSION) blockcnt);
 	    if (blockcnt < compptr->MCU_width) {
 	      /* Create some dummy blocks at the right edge of the image. */
-	      jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
-			(compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK));
+	      FMEMZERO((void FAR *) coef->MCU_buffer[blkn + blockcnt],
+		       (compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK));
 	      for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
 		coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
 	      }
 	    }
 	  } else {
 	    /* Create a row of dummy blocks at the bottom of the image. */
-	    jzero_far((void FAR *) coef->MCU_buffer[blkn],
-		      compptr->MCU_width * SIZEOF(JBLOCK));
+	    FMEMZERO((void FAR *) coef->MCU_buffer[blkn],
+		     compptr->MCU_width * SIZEOF(JBLOCK));
 	    for (bi = 0; bi < compptr->MCU_width; bi++) {
 	      coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  ypos += DCTSIZE;
+	  ypos += compptr->DCT_v_scaled_size;
 	}
       }
       /* Try to write the MCU.  In event of a suspension failure, we will
@@ -252,6 +256,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   jpeg_component_info *compptr;
   JBLOCKARRAY buffer;
   JBLOCKROW thisblockrow, lastblockrow;
+  forward_DCT_ptr forward_DCT;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -274,19 +279,19 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     ndummy = (int) (blocks_across % h_samp_factor);
     if (ndummy > 0)
       ndummy = h_samp_factor - ndummy;
+    forward_DCT = cinfo->fdct->forward_DCT[ci];
     /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
      * on forward_DCT processes a complete horizontal row of DCT blocks.
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
-      (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-				   input_buf[ci], thisblockrow,
-				   (JDIMENSION) (block_row * DCTSIZE),
-				   (JDIMENSION) 0, blocks_across);
+      (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow,
+		      (JDIMENSION) (block_row * compptr->DCT_v_scaled_size),
+		      (JDIMENSION) 0, blocks_across);
       if (ndummy > 0) {
 	/* Create dummy blocks at the right edge of the image. */
 	thisblockrow += blocks_across; /* => first dummy block */
-	jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
+	FMEMZERO((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
 	lastDC = thisblockrow[-1][0];
 	for (bi = 0; bi < ndummy; bi++) {
 	  thisblockrow[bi][0] = lastDC;
@@ -305,8 +310,8 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	   block_row++) {
 	thisblockrow = buffer[block_row];
 	lastblockrow = buffer[block_row-1];
-	jzero_far((void FAR *) thisblockrow,
-		  (size_t) (blocks_across * SIZEOF(JBLOCK)));
+	FMEMZERO((void FAR *) thisblockrow,
+		 (size_t) (blocks_across * SIZEOF(JBLOCK)));
 	for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
 	  lastDC = lastblockrow[h_samp_factor-1][0];
 	  for (bi = 0; bi < h_samp_factor; bi++) {
diff --git a/3rdparty/libjpeg/jccolor.c b/3rdparty/libjpeg/jccolor.c
index 0a8a4b5d13..3e2d0e9277 100644
--- a/3rdparty/libjpeg/jccolor.c
+++ b/3rdparty/libjpeg/jccolor.c
@@ -2,6 +2,7 @@
  * jccolor.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -299,6 +300,39 @@ grayscale_convert (j_compress_ptr cinfo,
 }
 
 
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * No colorspace conversion, but change from interleaved
+ * to separate-planes representation.
+ */
+
+METHODDEF(void)
+rgb_convert (j_compress_ptr cinfo,
+	     JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+	     JDIMENSION output_row, int num_rows)
+{
+  register JSAMPROW inptr;
+  register JSAMPROW outptr0, outptr1, outptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->image_width;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
+      inptr += RGB_PIXELSIZE;
+    }
+  }
+}
+
+
 /*
  * Convert some rows of samples to the JPEG colorspace.
  * This version handles multi-component colorspaces without conversion.
@@ -368,11 +402,9 @@ jinit_color_converter (j_compress_ptr cinfo)
     break;
 
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
     if (cinfo->input_components != RGB_PIXELSIZE)
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     break;
-#endif /* else share code with YCbCr */
 
   case JCS_YCbCr:
     if (cinfo->input_components != 3)
@@ -396,22 +428,21 @@ jinit_color_converter (j_compress_ptr cinfo)
   case JCS_GRAYSCALE:
     if (cinfo->num_components != 1)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_GRAYSCALE)
+    if (cinfo->in_color_space == JCS_GRAYSCALE ||
+	cinfo->in_color_space == JCS_YCbCr)
       cconvert->pub.color_convert = grayscale_convert;
     else if (cinfo->in_color_space == JCS_RGB) {
       cconvert->pub.start_pass = rgb_ycc_start;
       cconvert->pub.color_convert = rgb_gray_convert;
-    } else if (cinfo->in_color_space == JCS_YCbCr)
-      cconvert->pub.color_convert = grayscale_convert;
-    else
+    } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_RGB:
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
-      cconvert->pub.color_convert = null_convert;
+    if (cinfo->in_color_space == JCS_RGB)
+      cconvert->pub.color_convert = rgb_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
diff --git a/3rdparty/libjpeg/jcdctmgr.c b/3rdparty/libjpeg/jcdctmgr.c
index 61fa79b9e6..0bbdbb685d 100644
--- a/3rdparty/libjpeg/jcdctmgr.c
+++ b/3rdparty/libjpeg/jcdctmgr.c
@@ -23,7 +23,7 @@ typedef struct {
   struct jpeg_forward_dct pub;	/* public fields */
 
   /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr do_dct[MAX_COMPONENTS];
 
   /* The actual post-DCT divisors --- not identical to the quant table
    * entries, because of scaling (especially for an unnormalized DCT).
@@ -33,7 +33,7 @@ typedef struct {
 
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr do_float_dct[MAX_COMPONENTS];
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 #endif
 } my_fdct_controller;
@@ -41,6 +41,132 @@ typedef struct {
 typedef my_fdct_controller * my_fdct_ptr;
 
 
+/* The current scaled-DCT routines require ISLOW-style divisor tables,
+ * so be sure to compile that code if either ISLOW or SCALING is requested.
+ */
+#ifdef DCT_ISLOW_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#else
+#ifdef DCT_SCALING_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
+#endif
+#endif
+
+
+/*
+ * Perform forward DCT on one or more blocks of a component.
+ *
+ * The input samples are taken from the sample_data[] array starting at
+ * position start_row/start_col, and moving to the right for any additional
+ * blocks. The quantized coefficients are returned in coef_blocks[].
+ */
+
+METHODDEF(void)
+forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
+	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+	     JDIMENSION start_row, JDIMENSION start_col,
+	     JDIMENSION num_blocks)
+/* This version is used for integer DCT implementations. */
+{
+  /* This routine is heavily used, so it's worth coding it tightly. */
+  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index];
+  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
+  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  JDIMENSION bi;
+
+  sample_data += start_row;	/* fold in the vertical offset once */
+
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+    /* Perform the DCT */
+    (*do_dct) (workspace, sample_data, start_col);
+
+    /* Quantize/descale the coefficients, and store into coef_blocks[] */
+    { register DCTELEM temp, qval;
+      register int i;
+      register JCOEFPTR output_ptr = coef_blocks[bi];
+
+      for (i = 0; i < DCTSIZE2; i++) {
+	qval = divisors[i];
+	temp = workspace[i];
+	/* Divide the coefficient value by qval, ensuring proper rounding.
+	 * Since C does not specify the direction of rounding for negative
+	 * quotients, we have to force the dividend positive for portability.
+	 *
+	 * In most files, at least half of the output values will be zero
+	 * (at default quantization settings, more like three-quarters...)
+	 * so we should ensure that this case is fast.  On many machines,
+	 * a comparison is enough cheaper than a divide to make a special test
+	 * a win.  Since both inputs will be nonnegative, we need only test
+	 * for a < b to discover whether a/b is 0.
+	 * If your machine's division is fast enough, define FAST_DIVIDE.
+	 */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a,b)	a /= b
+#else
+#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
+#endif
+	if (temp < 0) {
+	  temp = -temp;
+	  temp += qval>>1;	/* for rounding */
+	  DIVIDE_BY(temp, qval);
+	  temp = -temp;
+	} else {
+	  temp += qval>>1;	/* for rounding */
+	  DIVIDE_BY(temp, qval);
+	}
+	output_ptr[i] = (JCOEF) temp;
+      }
+    }
+  }
+}
+
+
+#ifdef DCT_FLOAT_SUPPORTED
+
+METHODDEF(void)
+forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
+		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+		   JDIMENSION start_row, JDIMENSION start_col,
+		   JDIMENSION num_blocks)
+/* This version is used for floating-point DCT implementations. */
+{
+  /* This routine is heavily used, so it's worth coding it tightly. */
+  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index];
+  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
+  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  JDIMENSION bi;
+
+  sample_data += start_row;	/* fold in the vertical offset once */
+
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
+    /* Perform the DCT */
+    (*do_dct) (workspace, sample_data, start_col);
+
+    /* Quantize/descale the coefficients, and store into coef_blocks[] */
+    { register FAST_FLOAT temp;
+      register int i;
+      register JCOEFPTR output_ptr = coef_blocks[bi];
+
+      for (i = 0; i < DCTSIZE2; i++) {
+	/* Apply the quantization and scaling factor */
+	temp = workspace[i] * divisors[i];
+	/* Round to nearest integer.
+	 * Since C does not specify the direction of rounding for negative
+	 * quotients, we have to force the dividend positive for portability.
+	 * The maximum coefficient size is +-16K (for 12-bit data), so this
+	 * code should work for either 16-bit or 32-bit ints.
+	 */
+	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+      }
+    }
+  }
+}
+
+#endif /* DCT_FLOAT_SUPPORTED */
+
+
 /*
  * Initialize for a processing pass.
  * Verify that all referenced Q-tables are present, and set up
@@ -56,11 +182,170 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
+  int method = 0;
   JQUANT_TBL * qtbl;
   DCTELEM * dtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    /* Select the proper DCT routine for this component's scaling */
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
+#ifdef DCT_SCALING_SUPPORTED
+    case ((1 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_1x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_2x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_3x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_4x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_5x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_6x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_7x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      fdct->do_dct[ci] = jpeg_fdct_9x9;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_10x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      fdct->do_dct[ci] = jpeg_fdct_11x11;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_12x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      fdct->do_dct[ci] = jpeg_fdct_13x13;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_14x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      fdct->do_dct[ci] = jpeg_fdct_15x15;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_16x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_16x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_14x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_12x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_10x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_8x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_6x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_4x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_2x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_8x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_7x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_6x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_5x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_4x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_3x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_2x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_1x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+#endif
+    case ((DCTSIZE << 8) + DCTSIZE):
+      switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+      case JDCT_ISLOW:
+	fdct->do_dct[ci] = jpeg_fdct_islow;
+	method = JDCT_ISLOW;
+	break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+      case JDCT_IFAST:
+	fdct->do_dct[ci] = jpeg_fdct_ifast;
+	method = JDCT_IFAST;
+	break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+      case JDCT_FLOAT:
+	fdct->do_float_dct[ci] = jpeg_fdct_float;
+	method = JDCT_FLOAT;
+	break;
+#endif
+      default:
+	ERREXIT(cinfo, JERR_NOT_COMPILED);
+	break;
+      }
+      break;
+    default:
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	       compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
+      break;
+    }
     qtblno = compptr->quant_tbl_no;
     /* Make sure specified quantization table is present */
     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
@@ -69,8 +354,8 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
     qtbl = cinfo->quant_tbl_ptrs[qtblno];
     /* Compute divisors for this quant table */
     /* We may do this more than once for same table, but it's not a big deal */
-    switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
+    switch (method) {
+#ifdef PROVIDE_ISLOW_TABLES
     case JDCT_ISLOW:
       /* For LL&M IDCT method, divisors are equal to raw quantization
        * coefficients multiplied by 8 (to counteract scaling).
@@ -84,6 +369,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
       for (i = 0; i < DCTSIZE2; i++) {
 	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
       break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
@@ -122,6 +408,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 		    CONST_BITS-3);
 	}
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
       break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
@@ -158,6 +445,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	  }
 	}
       }
+      fdct->pub.forward_DCT[ci] = forward_DCT_float;
       break;
 #endif
     default:
@@ -168,175 +456,6 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 }
 
 
-/*
- * Perform forward DCT on one or more blocks of a component.
- *
- * The input samples are taken from the sample_data[] array starting at
- * position start_row/start_col, and moving to the right for any additional
- * blocks. The quantized coefficients are returned in coef_blocks[].
- */
-
-METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
-	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-	     JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks)
-/* This version is used for integer DCT implementations. */
-{
-  /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
-  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
-  JDIMENSION bi;
-
-  sample_data += start_row;	/* fold in the vertical offset once */
-
-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
-
-    /* Perform the DCT */
-    (*do_dct) (workspace);
-
-    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
-  }
-}
-
-
-#ifdef DCT_FLOAT_SUPPORTED
-
-METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
-		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-		   JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks)
-/* This version is used for floating-point DCT implementations. */
-{
-  /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
-  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
-  JDIMENSION bi;
-
-  sample_data += start_row;	/* fold in the vertical offset once */
-
-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
-
-    /* Perform the DCT */
-    (*do_dct) (workspace);
-
-    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
-  }
-}
-
-#endif /* DCT_FLOAT_SUPPORTED */
-
-
 /*
  * Initialize FDCT manager.
  */
@@ -353,30 +472,6 @@ jinit_forward_dct (j_compress_ptr cinfo)
   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
-  switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
-  case JDCT_ISLOW:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
-    break;
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  case JDCT_IFAST:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
-    break;
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  case JDCT_FLOAT:
-    fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
-    break;
-#endif
-  default:
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
-    break;
-  }
-
   /* Mark divisor tables unallocated */
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     fdct->divisors[i] = NULL;
diff --git a/3rdparty/libjpeg/jchuff.c b/3rdparty/libjpeg/jchuff.c
index f235250548..257d7aa1f5 100644
--- a/3rdparty/libjpeg/jchuff.c
+++ b/3rdparty/libjpeg/jchuff.c
@@ -2,22 +2,48 @@
  * jchuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains Huffman entropy encoding routines.
+ * Both sequential and progressive modes are supported in this single module.
  *
  * Much of the complexity here has to do with supporting output suspension.
  * If the data destination module demands suspension, we want to be able to
  * back up to the start of the current MCU.  To do this, we copy state
  * variables into local working storage, and update them back to the
  * permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * We do not support output suspension for the progressive JPEG mode, since
+ * the library currently does not allow multiple-scan files to be written
+ * with output suspension.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jcphuff.c */
+
+
+/* The legal range of a DCT coefficient is
+ *  -1024 .. +1023  for 8-bit data;
+ * -16384 .. +16383 for 12-bit data.
+ * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MAX_COEF_BITS 10
+#else
+#define MAX_COEF_BITS 14
+#endif
+
+/* Derived data constructed for each Huffman table */
+
+typedef struct {
+  unsigned int ehufco[256];	/* code for each symbol */
+  char ehufsi[256];		/* length of code for each symbol */
+  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
+} c_derived_tbl;
 
 
 /* Expanded entropy encoder object for Huffman encoding.
@@ -65,15 +91,32 @@ typedef struct {
   c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
   c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
 
-#ifdef ENTROPY_OPT_SUPPORTED	/* Statistics tables for optimization */
+  /* Statistics tables for optimization */
   long * dc_count_ptrs[NUM_HUFF_TBLS];
   long * ac_count_ptrs[NUM_HUFF_TBLS];
-#endif
+
+  /* Following fields used only in progressive mode */
+
+  /* Mode flag: TRUE for optimization, FALSE for actual data output */
+  boolean gather_statistics;
+
+  /* next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
+   */
+  JOCTET * next_output_byte;	/* => next byte to write in buffer */
+  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
+  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
+
+  /* Coding status for AC components */
+  int ac_tbl_no;		/* the table number of the single component */
+  unsigned int EOBRUN;		/* run length of EOBs */
+  unsigned int BE;		/* # of buffered correction bits before MCU */
+  char * bit_buffer;		/* buffer for correction bits (1 per char) */
+  /* packing correction bits tightly would save some space but cost time... */
 } huff_entropy_encoder;
 
 typedef huff_entropy_encoder * huff_entropy_ptr;
 
-/* Working state while writing an MCU.
+/* Working state while writing an MCU (sequential mode).
  * This struct contains all the fields that are needed by subroutines.
  */
 
@@ -84,98 +127,37 @@ typedef struct {
   j_compress_ptr cinfo;		/* dump_buffer needs access to this */
 } working_state;
 
+/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
+ * buffer can hold.  Larger sizes may slightly improve compression, but
+ * 1000 is already well into the realm of overkill.
+ * The minimum safe size is 64 bits.
+ */
 
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo,
-					JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo));
-#ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo,
-					  JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo));
-#endif
-
+#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
 
-/*
- * Initialize for a Huffman-compressed scan.
- * If gather_statistics is TRUE, we do not output anything during the scan,
- * just count the Huffman symbols used and generate Huffman code tables.
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
  */
 
-METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
-{
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, dctbl, actbl;
-  jpeg_component_info * compptr;
-
-  if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
-    entropy->pub.encode_mcu = encode_mcu_gather;
-    entropy->pub.finish_pass = finish_pass_gather;
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
 #else
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
 #endif
-  } else {
-    entropy->pub.encode_mcu = encode_mcu_huff;
-    entropy->pub.finish_pass = finish_pass_huff;
-  }
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    if (gather_statistics) {
-#ifdef ENTROPY_OPT_SUPPORTED
-      /* Check for invalid table indexes */
-      /* (make_c_derived_tbl does this in the other path) */
-      if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
-      if (actbl < 0 || actbl >= NUM_HUFF_TBLS)
-	ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl);
-      /* Allocate and zero the statistics tables */
-      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
-      if (entropy->dc_count_ptrs[dctbl] == NULL)
-	entropy->dc_count_ptrs[dctbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long));
-      if (entropy->ac_count_ptrs[actbl] == NULL)
-	entropy->ac_count_ptrs[actbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long));
-#endif
-    } else {
-      /* Compute derived values for Huffman tables */
-      /* We may do this more than once for a table, but it's not expensive */
-      jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
-			      & entropy->dc_derived_tbls[dctbl]);
-      jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
-			      & entropy->ac_derived_tbls[actbl]);
-    }
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
-
-  /* Initialize restart stuff */
-  entropy->restarts_to_go = cinfo->restart_interval;
-  entropy->next_restart_num = 0;
-}
 
 
 /*
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
- *
- * Note this is also used by jcphuff.c.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 			 c_derived_tbl ** pdtbl)
 {
@@ -264,18 +246,27 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 }
 
 
-/* Outputting bytes to the file */
+/* Outputting bytes to the file.
+ * NB: these must be called only when actually outputting,
+ * that is, entropy->gather_statistics == FALSE.
+ */
 
 /* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action)  \
+#define emit_byte_s(state,val,action)  \
 	{ *(state)->next_output_byte++ = (JOCTET) (val);  \
 	  if (--(state)->free_in_buffer == 0)  \
-	    if (! dump_buffer(state))  \
+	    if (! dump_buffer_s(state))  \
 	      { action; } }
 
+/* Emit a byte */
+#define emit_byte_e(entropy,val)  \
+	{ *(entropy)->next_output_byte++ = (JOCTET) (val);  \
+	  if (--(entropy)->free_in_buffer == 0)  \
+	    dump_buffer_e(entropy); }
+
 
 LOCAL(boolean)
-dump_buffer (working_state * state)
+dump_buffer_s (working_state * state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
   struct jpeg_destination_mgr * dest = state->cinfo->dest;
@@ -289,6 +280,20 @@ dump_buffer (working_state * state)
 }
 
 
+LOCAL(void)
+dump_buffer_e (huff_entropy_ptr entropy)
+/* Empty the output buffer; we do not support suspension in this case. */
+{
+  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
+
+  if (! (*dest->empty_output_buffer) (entropy->cinfo))
+    ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
+  /* After a successful buffer dump, must reset buffer pointers */
+  entropy->next_output_byte = dest->next_output_byte;
+  entropy->free_in_buffer = dest->free_in_buffer;
+}
+
+
 /* Outputting bits to the file */
 
 /* Only the right 24 bits of put_buffer are used; the valid bits are
@@ -299,7 +304,7 @@ dump_buffer (working_state * state)
 
 INLINE
 LOCAL(boolean)
-emit_bits (working_state * state, unsigned int code, int size)
+emit_bits_s (working_state * state, unsigned int code, int size)
 /* Emit some bits; return TRUE if successful, FALSE if must suspend */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
@@ -321,9 +326,9 @@ emit_bits (working_state * state, unsigned int code, int size)
   while (put_bits >= 8) {
     int c = (int) ((put_buffer >> 16) & 0xFF);
     
-    emit_byte(state, c, return FALSE);
+    emit_byte_s(state, c, return FALSE);
     if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(state, 0, return FALSE);
+      emit_byte_s(state, 0, return FALSE);
     }
     put_buffer <<= 8;
     put_bits -= 8;
@@ -336,17 +341,575 @@ emit_bits (working_state * state, unsigned int code, int size)
 }
 
 
+INLINE
+LOCAL(void)
+emit_bits_e (huff_entropy_ptr entropy, unsigned int code, int size)
+/* Emit some bits, unless we are in gather mode */
+{
+  /* This routine is heavily used, so it's worth coding tightly. */
+  register INT32 put_buffer = (INT32) code;
+  register int put_bits = entropy->saved.put_bits;
+
+  /* if size is 0, caller used an invalid Huffman table entry */
+  if (size == 0)
+    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+  if (entropy->gather_statistics)
+    return;			/* do nothing if we're only getting stats */
+
+  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
+  
+  put_bits += size;		/* new number of bits in buffer */
+
+  put_buffer <<= 24 - put_bits; /* align incoming bits */
+
+  /* and merge with old buffer contents */
+  put_buffer |= entropy->saved.put_buffer;
+
+  while (put_bits >= 8) {
+    int c = (int) ((put_buffer >> 16) & 0xFF);
+
+    emit_byte_e(entropy, c);
+    if (c == 0xFF) {		/* need to stuff a zero byte? */
+      emit_byte_e(entropy, 0);
+    }
+    put_buffer <<= 8;
+    put_bits -= 8;
+  }
+
+  entropy->saved.put_buffer = put_buffer; /* update variables */
+  entropy->saved.put_bits = put_bits;
+}
+
+
 LOCAL(boolean)
-flush_bits (working_state * state)
+flush_bits_s (working_state * state)
 {
-  if (! emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
+  if (! emit_bits_s(state, 0x7F, 7)) /* fill any partial byte with ones */
     return FALSE;
-  state->cur.put_buffer = 0;	/* and reset bit-buffer to empty */
+  state->cur.put_buffer = 0;	     /* and reset bit-buffer to empty */
   state->cur.put_bits = 0;
   return TRUE;
 }
 
 
+LOCAL(void)
+flush_bits_e (huff_entropy_ptr entropy)
+{
+  emit_bits_e(entropy, 0x7F, 7); /* fill any partial byte with ones */
+  entropy->saved.put_buffer = 0; /* and reset bit-buffer to empty */
+  entropy->saved.put_bits = 0;
+}
+
+
+/*
+ * Emit (or just count) a Huffman symbol.
+ */
+
+INLINE
+LOCAL(void)
+emit_dc_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+  if (entropy->gather_statistics)
+    entropy->dc_count_ptrs[tbl_no][symbol]++;
+  else {
+    c_derived_tbl * tbl = entropy->dc_derived_tbls[tbl_no];
+    emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
+  }
+}
+
+
+INLINE
+LOCAL(void)
+emit_ac_symbol (huff_entropy_ptr entropy, int tbl_no, int symbol)
+{
+  if (entropy->gather_statistics)
+    entropy->ac_count_ptrs[tbl_no][symbol]++;
+  else {
+    c_derived_tbl * tbl = entropy->ac_derived_tbls[tbl_no];
+    emit_bits_e(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
+  }
+}
+
+
+/*
+ * Emit bits from a correction bit buffer.
+ */
+
+LOCAL(void)
+emit_buffered_bits (huff_entropy_ptr entropy, char * bufstart,
+		    unsigned int nbits)
+{
+  if (entropy->gather_statistics)
+    return;			/* no real work */
+
+  while (nbits > 0) {
+    emit_bits_e(entropy, (unsigned int) (*bufstart), 1);
+    bufstart++;
+    nbits--;
+  }
+}
+
+
+/*
+ * Emit any pending EOBRUN symbol.
+ */
+
+LOCAL(void)
+emit_eobrun (huff_entropy_ptr entropy)
+{
+  register int temp, nbits;
+
+  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
+    temp = entropy->EOBRUN;
+    nbits = 0;
+    while ((temp >>= 1))
+      nbits++;
+    /* safety check: shouldn't happen given limited correction-bit buffer */
+    if (nbits > 14)
+      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
+
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
+    if (nbits)
+      emit_bits_e(entropy, entropy->EOBRUN, nbits);
+
+    entropy->EOBRUN = 0;
+
+    /* Emit any buffered correction bits */
+    emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
+    entropy->BE = 0;
+  }
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(boolean)
+emit_restart_s (working_state * state, int restart_num)
+{
+  int ci;
+
+  if (! flush_bits_s(state))
+    return FALSE;
+
+  emit_byte_s(state, 0xFF, return FALSE);
+  emit_byte_s(state, JPEG_RST0 + restart_num, return FALSE);
+
+  /* Re-initialize DC predictions to 0 */
+  for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
+    state->cur.last_dc_val[ci] = 0;
+
+  /* The restart counter is not updated until we successfully write the MCU. */
+
+  return TRUE;
+}
+
+
+LOCAL(void)
+emit_restart_e (huff_entropy_ptr entropy, int restart_num)
+{
+  int ci;
+
+  emit_eobrun(entropy);
+
+  if (! entropy->gather_statistics) {
+    flush_bits_e(entropy);
+    emit_byte_e(entropy, 0xFF);
+    emit_byte_e(entropy, JPEG_RST0 + restart_num);
+  }
+
+  if (entropy->cinfo->Ss == 0) {
+    /* Re-initialize DC predictions to 0 */
+    for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
+      entropy->saved.last_dc_val[ci] = 0;
+  } else {
+    /* Re-initialize all AC-related fields to 0 */
+    entropy->EOBRUN = 0;
+    entropy->BE = 0;
+  }
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp, temp2;
+  register int nbits;
+  int blkn, ci;
+  int Al = cinfo->Al;
+  JBLOCKROW block;
+  jpeg_component_info * compptr;
+  ISHIFT_TEMPS
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+
+    /* DC differences are figured on the point-transformed values. */
+    temp = temp2 - entropy->saved.last_dc_val[ci];
+    entropy->saved.last_dc_val[ci] = temp2;
+
+    /* Encode the DC coefficient difference per section G.1.2.1 */
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      /* For a negative input, want temp2 = bitwise complement of abs(input) */
+      /* This code assumes we are on a two's complement machine */
+      temp2--;
+    }
+    
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    while (temp) {
+      nbits++;
+      temp >>= 1;
+    }
+    /* Check for out-of-range coefficient values.
+     * Since we're encoding a difference, the range limit is twice as much.
+     */
+    if (nbits > MAX_COEF_BITS+1)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+    
+    /* Count/emit the Huffman-coded symbol for the number of bits */
+    emit_dc_symbol(entropy, compptr->dc_tbl_no, nbits);
+    
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
+    if (nbits)			/* emit_bits rejects calls with size 0 */
+      emit_bits_e(entropy, (unsigned int) temp2, nbits);
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp, temp2;
+  register int nbits;
+  register int r, k;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  Se = cinfo->Se;
+  Al = cinfo->Al;
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+
+  /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+  
+  r = 0;			/* r = run length of zeros */
+   
+  for (k = cinfo->Ss; k <= Se; k++) {
+    if ((temp = (*block)[natural_order[k]]) == 0) {
+      r++;
+      continue;
+    }
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value; so the code is
+     * interwoven with finding the abs value (temp) and output bits (temp2).
+     */
+    if (temp < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      temp >>= Al;		/* apply the point transform */
+      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;		/* apply the point transform */
+      temp2 = temp;
+    }
+    /* Watch out for case that nonzero coef is zero after point transform */
+    if (temp == 0) {
+      r++;
+      continue;
+    }
+
+    /* Emit any pending EOBRUN */
+    if (entropy->EOBRUN > 0)
+      emit_eobrun(entropy);
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+    while (r > 15) {
+      emit_ac_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+      r -= 16;
+    }
+
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 1;			/* there must be at least one 1 bit */
+    while ((temp >>= 1))
+      nbits++;
+    /* Check for out-of-range coefficient values */
+    if (nbits > MAX_COEF_BITS)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+    /* Count/emit Huffman symbol for run length / number of bits */
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
+    emit_bits_e(entropy, (unsigned int) temp2, nbits);
+
+    r = 0;			/* reset zero run length */
+  }
+
+  if (r > 0) {			/* If there are trailing zeroes, */
+    entropy->EOBRUN++;		/* count an EOB */
+    if (entropy->EOBRUN == 0x7FFF)
+      emit_eobrun(entropy);	/* force it out to avoid overflow */
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp;
+  int blkn;
+  int Al = cinfo->Al;
+  JBLOCKROW block;
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    temp = (*block)[0];
+    emit_bits_e(entropy, (unsigned int) (temp >> Al), 1);
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int temp;
+  register int r, k;
+  int EOB;
+  char *BR_buffer;
+  unsigned int BR;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+  int absvalues[DCTSIZE2];
+
+  entropy->next_output_byte = cinfo->dest->next_output_byte;
+  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval)
+    if (entropy->restarts_to_go == 0)
+      emit_restart_e(entropy, entropy->next_restart_num);
+
+  Se = cinfo->Se;
+  Al = cinfo->Al;
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+
+  /* It is convenient to make a pre-pass to determine the transformed
+   * coefficients' absolute values and the EOB position.
+   */
+  EOB = 0;
+  for (k = cinfo->Ss; k <= Se; k++) {
+    temp = (*block)[natural_order[k]];
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if (temp < 0)
+      temp = -temp;		/* temp is abs value of input */
+    temp >>= Al;		/* apply the point transform */
+    absvalues[k] = temp;	/* save abs value for main pass */
+    if (temp == 1)
+      EOB = k;			/* EOB = index of last newly-nonzero coef */
+  }
+
+  /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
+  
+  r = 0;			/* r = run length of zeros */
+  BR = 0;			/* BR = count of buffered bits added now */
+  BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
+
+  for (k = cinfo->Ss; k <= Se; k++) {
+    if ((temp = absvalues[k]) == 0) {
+      r++;
+      continue;
+    }
+
+    /* Emit any required ZRLs, but not if they can be folded into EOB */
+    while (r > 15 && k <= EOB) {
+      /* emit any pending EOBRUN and the BE correction bits */
+      emit_eobrun(entropy);
+      /* Emit ZRL */
+      emit_ac_symbol(entropy, entropy->ac_tbl_no, 0xF0);
+      r -= 16;
+      /* Emit buffered correction bits that must be associated with ZRL */
+      emit_buffered_bits(entropy, BR_buffer, BR);
+      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+      BR = 0;
+    }
+
+    /* If the coef was previously nonzero, it only needs a correction bit.
+     * NOTE: a straight translation of the spec's figure G.7 would suggest
+     * that we also need to test r > 15.  But if r > 15, we can only get here
+     * if k > EOB, which implies that this coefficient is not 1.
+     */
+    if (temp > 1) {
+      /* The correction bit is the next bit of the absolute value. */
+      BR_buffer[BR++] = (char) (temp & 1);
+      continue;
+    }
+
+    /* Emit any pending EOBRUN and the BE correction bits */
+    emit_eobrun(entropy);
+
+    /* Count/emit Huffman symbol for run length / number of bits */
+    emit_ac_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+
+    /* Emit output bit for newly-nonzero coef */
+    temp = ((*block)[natural_order[k]] < 0) ? 0 : 1;
+    emit_bits_e(entropy, (unsigned int) temp, 1);
+
+    /* Emit buffered correction bits that must be associated with this code */
+    emit_buffered_bits(entropy, BR_buffer, BR);
+    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
+    BR = 0;
+    r = 0;			/* reset zero run length */
+  }
+
+  if (r > 0 || BR > 0) {	/* If there are trailing zeroes, */
+    entropy->EOBRUN++;		/* count an EOB */
+    entropy->BE += BR;		/* concat my correction bits to older ones */
+    /* We force out the EOB if we risk either:
+     * 1. overflow of the EOB counter;
+     * 2. overflow of the correction bit buffer during the next MCU.
+     */
+    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+      emit_eobrun(entropy);
+  }
+
+  cinfo->dest->next_output_byte = entropy->next_output_byte;
+  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+
+  /* Update restart-interval state too */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  return TRUE;
+}
+
+
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
@@ -356,9 +919,11 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
   register int temp, temp2;
   register int nbits;
   register int k, r, i;
-  
+  int Se = state->cinfo->lim_Se;
+  const int * natural_order = state->cinfo->natural_order;
+
   /* Encode the DC coefficient difference per section F.1.2.1 */
-  
+
   temp = temp2 = block[0] - last_dc_val;
 
   if (temp < 0) {
@@ -367,7 +932,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
     /* This code assumes we are on a two's complement machine */
     temp2--;
   }
-  
+
   /* Find the number of bits needed for the magnitude of the coefficient */
   nbits = 0;
   while (temp) {
@@ -379,28 +944,28 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
    */
   if (nbits > MAX_COEF_BITS+1)
     ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-  
+
   /* Emit the Huffman-coded symbol for the number of bits */
-  if (! emit_bits(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
+  if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
     return FALSE;
 
   /* Emit that number of bits of the value, if positive, */
   /* or the complement of its magnitude, if negative. */
   if (nbits)			/* emit_bits rejects calls with size 0 */
-    if (! emit_bits(state, (unsigned int) temp2, nbits))
+    if (! emit_bits_s(state, (unsigned int) temp2, nbits))
       return FALSE;
 
   /* Encode the AC coefficients per section F.1.2.2 */
-  
+
   r = 0;			/* r = run length of zeros */
-  
-  for (k = 1; k < DCTSIZE2; k++) {
-    if ((temp = block[jpeg_natural_order[k]]) == 0) {
+
+  for (k = 1; k <= Se; k++) {
+    if ((temp = block[natural_order[k]]) == 0) {
       r++;
     } else {
       /* if run length > 15, must emit special run-length-16 codes (0xF0) */
       while (r > 15) {
-	if (! emit_bits(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
+	if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
 	  return FALSE;
 	r -= 16;
       }
@@ -411,7 +976,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
 	/* This code assumes we are on a two's complement machine */
 	temp2--;
       }
-      
+
       /* Find the number of bits needed for the magnitude of the coefficient */
       nbits = 1;		/* there must be at least one 1 bit */
       while ((temp >>= 1))
@@ -419,55 +984,30 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
       /* Check for out-of-range coefficient values */
       if (nbits > MAX_COEF_BITS)
 	ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-      
+
       /* Emit Huffman symbol for run length / number of bits */
       i = (r << 4) + nbits;
-      if (! emit_bits(state, actbl->ehufco[i], actbl->ehufsi[i]))
+      if (! emit_bits_s(state, actbl->ehufco[i], actbl->ehufsi[i]))
 	return FALSE;
 
       /* Emit that number of bits of the value, if positive, */
       /* or the complement of its magnitude, if negative. */
-      if (! emit_bits(state, (unsigned int) temp2, nbits))
+      if (! emit_bits_s(state, (unsigned int) temp2, nbits))
 	return FALSE;
-      
+
       r = 0;
     }
   }
 
   /* If the last coef(s) were zero, emit an end-of-block code */
   if (r > 0)
-    if (! emit_bits(state, actbl->ehufco[0], actbl->ehufsi[0]))
+    if (! emit_bits_s(state, actbl->ehufco[0], actbl->ehufsi[0]))
       return FALSE;
 
   return TRUE;
 }
 
 
-/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(boolean)
-emit_restart (working_state * state, int restart_num)
-{
-  int ci;
-
-  if (! flush_bits(state))
-    return FALSE;
-
-  emit_byte(state, 0xFF, return FALSE);
-  emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
-
-  /* Re-initialize DC predictions to 0 */
-  for (ci = 0; ci < state->cinfo->comps_in_scan; ci++)
-    state->cur.last_dc_val[ci] = 0;
-
-  /* The restart counter is not updated until we successfully write the MCU. */
-
-  return TRUE;
-}
-
-
 /*
  * Encode and output one MCU's worth of Huffman-compressed coefficients.
  */
@@ -489,7 +1029,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! emit_restart(&state, entropy->next_restart_num))
+      if (! emit_restart_s(&state, entropy->next_restart_num))
 	return FALSE;
   }
 
@@ -535,20 +1075,32 @@ finish_pass_huff (j_compress_ptr cinfo)
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   working_state state;
 
-  /* Load up working state ... flush_bits needs it */
-  state.next_output_byte = cinfo->dest->next_output_byte;
-  state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
-  state.cinfo = cinfo;
+  if (cinfo->progressive_mode) {
+    entropy->next_output_byte = cinfo->dest->next_output_byte;
+    entropy->free_in_buffer = cinfo->dest->free_in_buffer;
 
-  /* Flush out the last data */
-  if (! flush_bits(&state))
-    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+    /* Flush out any buffered data */
+    emit_eobrun(entropy);
+    flush_bits_e(entropy);
 
-  /* Update state */
-  cinfo->dest->next_output_byte = state.next_output_byte;
-  cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+    cinfo->dest->next_output_byte = entropy->next_output_byte;
+    cinfo->dest->free_in_buffer = entropy->free_in_buffer;
+  } else {
+    /* Load up working state ... flush_bits needs it */
+    state.next_output_byte = cinfo->dest->next_output_byte;
+    state.free_in_buffer = cinfo->dest->free_in_buffer;
+    ASSIGN_STATE(state.cur, entropy->saved);
+    state.cinfo = cinfo;
+
+    /* Flush out the last data */
+    if (! flush_bits_s(&state))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+    /* Update state */
+    cinfo->dest->next_output_byte = state.next_output_byte;
+    cinfo->dest->free_in_buffer = state.free_in_buffer;
+    ASSIGN_STATE(entropy->saved, state.cur);
+  }
 }
 
 
@@ -563,8 +1115,6 @@ finish_pass_huff (j_compress_ptr cinfo)
  * the compressed data.
  */
 
-#ifdef ENTROPY_OPT_SUPPORTED
-
 
 /* Process a single block's worth of coefficients */
 
@@ -575,6 +1125,8 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   register int temp;
   register int nbits;
   register int k, r;
+  int Se = cinfo->lim_Se;
+  const int * natural_order = cinfo->natural_order;
   
   /* Encode the DC coefficient difference per section F.1.2.1 */
   
@@ -601,8 +1153,8 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   
   r = 0;			/* r = run length of zeros */
   
-  for (k = 1; k < DCTSIZE2; k++) {
-    if ((temp = block[jpeg_natural_order[k]]) == 0) {
+  for (k = 1; k <= Se; k++) {
+    if ((temp = block[natural_order[k]]) == 0) {
       r++;
     } else {
       /* if run length > 15, must emit special run-length-16 codes (0xF0) */
@@ -675,7 +1227,6 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
 /*
  * Generate the best Huffman code table for the given counts, fill htbl.
- * Note this is also used by jcphuff.c.
  *
  * The JPEG standard requires that no symbol be assigned a codeword of all
  * one bits (so that padding bits added at the end of a compressed segment
@@ -701,7 +1252,7 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  * So the extra complexity of an optimal algorithm doesn't seem worthwhile.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])
 {
 #define MAX_CLEN 32		/* assumed maximum initial code length */
@@ -846,7 +1397,7 @@ METHODDEF(void)
 finish_pass_gather (j_compress_ptr cinfo)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, dctbl, actbl;
+  int ci, tbl;
   jpeg_component_info * compptr;
   JHUFF_TBL **htblptr;
   boolean did_dc[NUM_HUFF_TBLS];
@@ -855,32 +1406,147 @@ finish_pass_gather (j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
+  if (cinfo->progressive_mode)
+    /* Flush out buffered data (all we care about is counting the EOB symbol) */
+    emit_eobrun(entropy);
+
   MEMZERO(did_dc, SIZEOF(did_dc));
   MEMZERO(did_ac, SIZEOF(did_ac));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    if (! did_dc[dctbl]) {
-      htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
-      if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
-      did_dc[dctbl] = TRUE;
-    }
-    if (! did_ac[actbl]) {
-      htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
-      if (*htblptr == NULL)
-	*htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
-      did_ac[actbl] = TRUE;
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (! did_dc[tbl]) {
+	htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+	if (*htblptr == NULL)
+	  *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+	jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[tbl]);
+	did_dc[tbl] = TRUE;
+      }
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (! did_ac[tbl]) {
+	htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+	if (*htblptr == NULL)
+	  *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+	jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[tbl]);
+	did_ac[tbl] = TRUE;
+      }
     }
   }
 }
 
 
-#endif /* ENTROPY_OPT_SUPPORTED */
+/*
+ * Initialize for a Huffman-compressed scan.
+ * If gather_statistics is TRUE, we do not output anything during the scan,
+ * just count the Huffman symbols used and generate Huffman code tables.
+ */
+
+METHODDEF(void)
+start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    entropy->pub.finish_pass = finish_pass_gather;
+  else
+    entropy->pub.finish_pass = finish_pass_huff;
+
+  if (cinfo->progressive_mode) {
+    entropy->cinfo = cinfo;
+    entropy->gather_statistics = gather_statistics;
+
+    /* We assume jcmaster.c already validated the scan parameters. */
+
+    /* Select execution routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else {
+	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+	/* AC refinement needs a correction bit buffer */
+	if (entropy->bit_buffer == NULL)
+	  entropy->bit_buffer = (char *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					MAX_CORR_BITS * SIZEOF(char));
+      }
+    }
+
+    /* Initialize AC stuff */
+    entropy->ac_tbl_no = cinfo->cur_comp_info[0]->ac_tbl_no;
+    entropy->EOBRUN = 0;
+    entropy->BE = 0;
+  } else {
+    if (gather_statistics)
+      entropy->pub.encode_mcu = encode_mcu_gather;
+    else
+      entropy->pub.encode_mcu = encode_mcu_huff;
+  }
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (gather_statistics) {
+	/* Check for invalid table index */
+	/* (make_c_derived_tbl does this in the other path) */
+	if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+	  ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+	/* Allocate and zero the statistics tables */
+	/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+	if (entropy->dc_count_ptrs[tbl] == NULL)
+	  entropy->dc_count_ptrs[tbl] = (long *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					257 * SIZEOF(long));
+	MEMZERO(entropy->dc_count_ptrs[tbl], 257 * SIZEOF(long));
+      } else {
+	/* Compute derived values for Huffman tables */
+	/* We may do this more than once for a table, but it's not expensive */
+	jpeg_make_c_derived_tbl(cinfo, TRUE, tbl,
+				& entropy->dc_derived_tbls[tbl]);
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (gather_statistics) {
+	if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
+	  ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
+	if (entropy->ac_count_ptrs[tbl] == NULL)
+	  entropy->ac_count_ptrs[tbl] = (long *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					257 * SIZEOF(long));
+	MEMZERO(entropy->ac_count_ptrs[tbl], 257 * SIZEOF(long));
+      } else {
+	jpeg_make_c_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->ac_derived_tbls[tbl]);
+      }
+    }
+  }
+
+  /* Initialize bit buffer to empty */
+  entropy->saved.put_buffer = 0;
+  entropy->saved.put_bits = 0;
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
 
 
 /*
@@ -902,8 +1568,9 @@ jinit_huff_encoder (j_compress_ptr cinfo)
   /* Mark tables unallocated */
   for (i = 0; i < NUM_HUFF_TBLS; i++) {
     entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
-#ifdef ENTROPY_OPT_SUPPORTED
     entropy->dc_count_ptrs[i] = entropy->ac_count_ptrs[i] = NULL;
-#endif
   }
+
+  if (cinfo->progressive_mode)
+    entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
 }
diff --git a/3rdparty/libjpeg/jchuff.h b/3rdparty/libjpeg/jchuff.h
deleted file mode 100644
index a9599fc1e6..0000000000
--- a/3rdparty/libjpeg/jchuff.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * jchuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy encoding routines
- * that are shared between the sequential encoder (jchuff.c) and the
- * progressive encoder (jcphuff.c).  No other modules need to see these.
- */
-
-/* The legal range of a DCT coefficient is
- *  -1024 .. +1023  for 8-bit data;
- * -16384 .. +16383 for 12-bit data.
- * Hence the magnitude should always fit in 10 or 14 bits respectively.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
-#else
-#define MAX_COEF_BITS 14
-#endif
-
-/* Derived data constructed for each Huffman table */
-
-typedef struct {
-  unsigned int ehufco[256];	/* code for each symbol */
-  char ehufsi[256];		/* length of code for each symbol */
-  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
-} c_derived_tbl;
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_c_derived_tbl	jMkCDerived
-#define jpeg_gen_optimal_table	jGenOptTbl
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
-	JPP((j_compress_ptr cinfo, boolean isDC, int tblno,
-	     c_derived_tbl ** pdtbl));
-
-/* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
-	JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
diff --git a/3rdparty/libjpeg/jcinit.c b/3rdparty/libjpeg/jcinit.c
index 5efffe3316..0ba310f217 100644
--- a/3rdparty/libjpeg/jcinit.c
+++ b/3rdparty/libjpeg/jcinit.c
@@ -41,17 +41,10 @@ jinit_compress_master (j_compress_ptr cinfo)
   /* Forward DCT */
   jinit_forward_dct(cinfo);
   /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_encoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
+    jinit_huff_encoder(cinfo);
   }
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
diff --git a/3rdparty/libjpeg/jcmainct.c b/3rdparty/libjpeg/jcmainct.c
index 96b96cf082..7de75d1675 100644
--- a/3rdparty/libjpeg/jcmainct.c
+++ b/3rdparty/libjpeg/jcmainct.c
@@ -5,8 +5,8 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
- * This file contains the main_ptr buffer controller for compression.
- * The main_ptr buffer lies between the pre-processor and the JPEG
+ * This file contains the main buffer controller for compression.
+ * The main buffer lies between the pre-processor and the JPEG
  * compressor proper; it holds downsampled data in the JPEG colorspace.
  */
 
@@ -68,32 +68,32 @@ METHODDEF(void) process_data_buffer_main
 METHODDEF(void)
 start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
   /* Do nothing in raw-data mode. */
   if (cinfo->raw_data_in)
     return;
 
-  main_ptr->cur_iMCU_row = 0;	/* initialize counters */
-  main_ptr->rowgroup_ctr = 0;
-  main_ptr->suspended = FALSE;
-  main_ptr->pass_mode = pass_mode;	/* save mode for use by process_data */
+  main->cur_iMCU_row = 0;	/* initialize counters */
+  main->rowgroup_ctr = 0;
+  main->suspended = FALSE;
+  main->pass_mode = pass_mode;	/* save mode for use by process_data */
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    if (main_ptr->whole_image[0] != NULL)
+    if (main->whole_image[0] != NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
-    main_ptr->pub.process_data = process_data_simple_main;
+    main->pub.process_data = process_data_simple_main;
     break;
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
   case JBUF_SAVE_SOURCE:
   case JBUF_CRANK_DEST:
   case JBUF_SAVE_AND_PASS:
-    if (main_ptr->whole_image[0] == NULL)
+    if (main->whole_image[0] == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    main_ptr->pub.process_data = process_data_buffer_main;
+    main->pub.process_data = process_data_buffer_main;
     break;
 #endif
   default:
@@ -114,46 +114,46 @@ process_data_simple_main (j_compress_ptr cinfo,
 			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
 			  JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
-    /* Read input data if we haven't filled the main_ptr buffer yet */
-    if (main_ptr->rowgroup_ctr < DCTSIZE)
+  while (main->cur_iMCU_row < cinfo->total_iMCU_rows) {
+    /* Read input data if we haven't filled the main buffer yet */
+    if (main->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size)
       (*cinfo->prep->pre_process_data) (cinfo,
 					input_buf, in_row_ctr, in_rows_avail,
-					main_ptr->buffer, &main_ptr->rowgroup_ctr,
-					(JDIMENSION) DCTSIZE);
+					main->buffer, &main->rowgroup_ctr,
+					(JDIMENSION) cinfo->min_DCT_v_scaled_size);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
      * at the bottom of the image.
      */
-    if (main_ptr->rowgroup_ctr != DCTSIZE)
+    if (main->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size)
       return;
 
     /* Send the completed row to the compressor */
-    if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (! (*cinfo->coef->compress_data) (cinfo, main->buffer)) {
       /* If compressor did not consume the whole row, then we must need to
        * suspend processing and return to the application.  In this situation
        * we pretend we didn't yet consume the last input row; otherwise, if
        * it happened to be the last row of the image, the application would
        * think we were done.
        */
-      if (! main_ptr->suspended) {
+      if (! main->suspended) {
 	(*in_row_ctr)--;
-	main_ptr->suspended = TRUE;
+	main->suspended = TRUE;
       }
       return;
     }
     /* We did finish the row.  Undo our little suspension hack if a previous
-     * call suspended; then mark the main_ptr buffer empty.
+     * call suspended; then mark the main buffer empty.
      */
-    if (main_ptr->suspended) {
+    if (main->suspended) {
       (*in_row_ctr)++;
-      main_ptr->suspended = FALSE;
+      main->suspended = FALSE;
     }
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
+    main->rowgroup_ctr = 0;
+    main->cur_iMCU_row++;
   }
 }
 
@@ -170,25 +170,25 @@ process_data_buffer_main (j_compress_ptr cinfo,
 			  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
 			  JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci;
   jpeg_component_info *compptr;
-  boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
+  boolean writing = (main->pass_mode != JBUF_CRANK_DEST);
 
-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
+  while (main->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Realign the virtual buffers if at the start of an iMCU row. */
-    if (main_ptr->rowgroup_ctr == 0) {
+    if (main->rowgroup_ctr == 0) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	   ci++, compptr++) {
-	main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
-	  ((j_common_ptr) cinfo, main_ptr->whole_image[ci],
-	   main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
+	main->buffer[ci] = (*cinfo->mem->access_virt_sarray)
+	  ((j_common_ptr) cinfo, main->whole_image[ci],
+	   main->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
 	   (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
       }
       /* In a read pass, pretend we just read some source data. */
       if (! writing) {
 	*in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
-	main_ptr->rowgroup_ctr = DCTSIZE;
+	main->rowgroup_ctr = DCTSIZE;
       }
     }
 
@@ -197,40 +197,40 @@ process_data_buffer_main (j_compress_ptr cinfo,
     if (writing) {
       (*cinfo->prep->pre_process_data) (cinfo,
 					input_buf, in_row_ctr, in_rows_avail,
-					main_ptr->buffer, &main_ptr->rowgroup_ctr,
+					main->buffer, &main->rowgroup_ctr,
 					(JDIMENSION) DCTSIZE);
       /* Return to application if we need more data to fill the iMCU row. */
-      if (main_ptr->rowgroup_ctr < DCTSIZE)
+      if (main->rowgroup_ctr < DCTSIZE)
 	return;
     }
 
     /* Emit data, unless this is a sink-only pass. */
-    if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
-      if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (main->pass_mode != JBUF_SAVE_SOURCE) {
+      if (! (*cinfo->coef->compress_data) (cinfo, main->buffer)) {
 	/* If compressor did not consume the whole row, then we must need to
 	 * suspend processing and return to the application.  In this situation
 	 * we pretend we didn't yet consume the last input row; otherwise, if
 	 * it happened to be the last row of the image, the application would
 	 * think we were done.
 	 */
-	if (! main_ptr->suspended) {
+	if (! main->suspended) {
 	  (*in_row_ctr)--;
-	  main_ptr->suspended = TRUE;
+	  main->suspended = TRUE;
 	}
 	return;
       }
       /* We did finish the row.  Undo our little suspension hack if a previous
-       * call suspended; then mark the main_ptr buffer empty.
+       * call suspended; then mark the main buffer empty.
        */
-      if (main_ptr->suspended) {
+      if (main->suspended) {
 	(*in_row_ctr)++;
-	main_ptr->suspended = FALSE;
+	main->suspended = FALSE;
       }
     }
 
     /* If get here, we are done with this iMCU row.  Mark buffer empty. */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
+    main->rowgroup_ctr = 0;
+    main->cur_iMCU_row++;
   }
 }
 
@@ -238,21 +238,21 @@ process_data_buffer_main (j_compress_ptr cinfo,
 
 
 /*
- * Initialize main_ptr buffer controller.
+ * Initialize main buffer controller.
  */
 
 GLOBAL(void)
 jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr main_ptr;
+  my_main_ptr main;
   int ci;
   jpeg_component_info *compptr;
 
-  main_ptr = (my_main_ptr)
+  main = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
-  main_ptr->pub.start_pass = start_pass_main;
+  cinfo->main = (struct jpeg_c_main_controller *) main;
+  main->pub.start_pass = start_pass_main;
 
   /* We don't need to create a buffer in raw-data mode. */
   if (cinfo->raw_data_in)
@@ -267,27 +267,27 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     /* Note we pad the bottom to a multiple of the iMCU height */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
-      main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
+      main->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-	 compptr->width_in_blocks * DCTSIZE,
+	 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
 	 (JDIMENSION) jround_up((long) compptr->height_in_blocks,
 				(long) compptr->v_samp_factor) * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+	 (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
     }
 #else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
   } else {
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
+    main->whole_image[0] = NULL; /* flag for no virtual arrays */
 #endif
     /* Allocate a strip buffer for each component */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
-      main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+      main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 compptr->width_in_blocks * DCTSIZE,
-	 (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+	 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+	 (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
     }
   }
 }
diff --git a/3rdparty/libjpeg/jcmarker.c b/3rdparty/libjpeg/jcmarker.c
index 3d1e6c6d52..606c19af39 100644
--- a/3rdparty/libjpeg/jcmarker.c
+++ b/3rdparty/libjpeg/jcmarker.c
@@ -2,6 +2,7 @@
  * jcmarker.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -153,21 +154,22 @@ emit_dqt (j_compress_ptr cinfo, int index)
     ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
 
   prec = 0;
-  for (i = 0; i < DCTSIZE2; i++) {
-    if (qtbl->quantval[i] > 255)
+  for (i = 0; i <= cinfo->lim_Se; i++) {
+    if (qtbl->quantval[cinfo->natural_order[i]] > 255)
       prec = 1;
   }
 
   if (! qtbl->sent_table) {
     emit_marker(cinfo, M_DQT);
 
-    emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_2bytes(cinfo,
+      prec ? cinfo->lim_Se * 2 + 2 + 1 + 2 : cinfo->lim_Se + 1 + 1 + 2);
 
     emit_byte(cinfo, index + (prec<<4));
 
-    for (i = 0; i < DCTSIZE2; i++) {
+    for (i = 0; i <= cinfo->lim_Se; i++) {
       /* The table entries must be emitted in zigzag order. */
-      unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
+      unsigned int qval = qtbl->quantval[cinfo->natural_order[i]];
       if (prec)
 	emit_byte(cinfo, (int) (qval >> 8));
       emit_byte(cinfo, (int) (qval & 0xFF));
@@ -229,32 +231,38 @@ emit_dac (j_compress_ptr cinfo)
   char ac_in_use[NUM_ARITH_TBLS];
   int length, i;
   jpeg_component_info *compptr;
-  
+
   for (i = 0; i < NUM_ARITH_TBLS; i++)
     dc_in_use[i] = ac_in_use[i] = 0;
-  
+
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
-    dc_in_use[compptr->dc_tbl_no] = 1;
-    ac_in_use[compptr->ac_tbl_no] = 1;
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0)
+      dc_in_use[compptr->dc_tbl_no] = 1;
+    /* AC needs no table when not present */
+    if (cinfo->Se)
+      ac_in_use[compptr->ac_tbl_no] = 1;
   }
-  
+
   length = 0;
   for (i = 0; i < NUM_ARITH_TBLS; i++)
     length += dc_in_use[i] + ac_in_use[i];
-  
-  emit_marker(cinfo, M_DAC);
-  
-  emit_2bytes(cinfo, length*2 + 2);
-  
-  for (i = 0; i < NUM_ARITH_TBLS; i++) {
-    if (dc_in_use[i]) {
-      emit_byte(cinfo, i);
-      emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
-    }
-    if (ac_in_use[i]) {
-      emit_byte(cinfo, i + 0x10);
-      emit_byte(cinfo, cinfo->arith_ac_K[i]);
+
+  if (length) {
+    emit_marker(cinfo, M_DAC);
+
+    emit_2bytes(cinfo, length*2 + 2);
+
+    for (i = 0; i < NUM_ARITH_TBLS; i++) {
+      if (dc_in_use[i]) {
+	emit_byte(cinfo, i);
+	emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+      }
+      if (ac_in_use[i]) {
+	emit_byte(cinfo, i + 0x10);
+	emit_byte(cinfo, cinfo->arith_ac_K[i]);
+      }
     }
   }
 #endif /* C_ARITH_CODING_SUPPORTED */
@@ -285,13 +293,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->image_height > 65535L ||
-      (long) cinfo->image_width > 65535L)
+  if ((long) cinfo->jpeg_height > 65535L ||
+      (long) cinfo->jpeg_width > 65535L)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
 
   emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int) cinfo->image_height);
-  emit_2bytes(cinfo, (int) cinfo->image_width);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_height);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_width);
 
   emit_byte(cinfo, cinfo->num_components);
 
@@ -320,22 +328,16 @@ emit_sos (j_compress_ptr cinfo)
   for (i = 0; i < cinfo->comps_in_scan; i++) {
     compptr = cinfo->cur_comp_info[i];
     emit_byte(cinfo, compptr->component_id);
-    td = compptr->dc_tbl_no;
-    ta = compptr->ac_tbl_no;
-    if (cinfo->progressive_mode) {
-      /* Progressive mode: only DC or only AC tables are used in one scan;
-       * furthermore, Huffman coding of DC refinement uses no table at all.
-       * We emit 0 for unused field(s); this is recommended by the P&M text
-       * but does not seem to be specified in the standard.
-       */
-      if (cinfo->Ss == 0) {
-	ta = 0;			/* DC scan */
-	if (cinfo->Ah != 0 && !cinfo->arith_code)
-	  td = 0;		/* no DC table either */
-      } else {
-	td = 0;			/* AC scan */
-      }
-    }
+
+    /* We emit 0 for unused field(s); this is recommended by the P&M text
+     * but does not seem to be specified in the standard.
+     */
+
+    /* DC needs no table for refinement scan */
+    td = cinfo->Ss == 0 && cinfo->Ah == 0 ? compptr->dc_tbl_no : 0;
+    /* AC needs no table when not present */
+    ta = cinfo->Se ? compptr->ac_tbl_no : 0;
+
     emit_byte(cinfo, (td << 4) + ta);
   }
 
@@ -345,6 +347,22 @@ emit_sos (j_compress_ptr cinfo)
 }
 
 
+LOCAL(void)
+emit_pseudo_sos (j_compress_ptr cinfo)
+/* Emit a pseudo SOS marker */
+{
+  emit_marker(cinfo, M_SOS);
+  
+  emit_2bytes(cinfo, 2 + 1 + 3); /* length */
+  
+  emit_byte(cinfo, 0); /* Ns */
+
+  emit_byte(cinfo, 0); /* Ss */
+  emit_byte(cinfo, cinfo->block_size * cinfo->block_size - 1); /* Se */
+  emit_byte(cinfo, 0); /* Ah/Al */
+}
+
+
 LOCAL(void)
 emit_jfif_app0 (j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
@@ -484,7 +502,7 @@ write_file_header (j_compress_ptr cinfo)
 
 /*
  * Write frame header.
- * This consists of DQT and SOFn markers.
+ * This consists of DQT and SOFn markers, and a conditional pseudo SOS marker.
  * Note that we do not emit the SOF until we have emitted the DQT(s).
  * This avoids compatibility problems with incorrect implementations that
  * try to error-check the quant table numbers as soon as they see the SOF.
@@ -511,7 +529,7 @@ write_frame_header (j_compress_ptr cinfo)
    * Note we assume that Huffman table numbers won't be changed later.
    */
   if (cinfo->arith_code || cinfo->progressive_mode ||
-      cinfo->data_precision != 8) {
+      cinfo->data_precision != 8 || cinfo->block_size != DCTSIZE) {
     is_baseline = FALSE;
   } else {
     is_baseline = TRUE;
@@ -529,7 +547,10 @@ write_frame_header (j_compress_ptr cinfo)
 
   /* Emit the proper SOF marker */
   if (cinfo->arith_code) {
-    emit_sof(cinfo, M_SOF9);	/* SOF code for arithmetic coding */
+    if (cinfo->progressive_mode)
+      emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */
+    else
+      emit_sof(cinfo, M_SOF9);  /* SOF code for sequential arithmetic */
   } else {
     if (cinfo->progressive_mode)
       emit_sof(cinfo, M_SOF2);	/* SOF code for progressive Huffman */
@@ -538,6 +559,10 @@ write_frame_header (j_compress_ptr cinfo)
     else
       emit_sof(cinfo, M_SOF1);	/* SOF code for non-baseline Huffman file */
   }
+
+  /* Check to emit pseudo SOS marker */
+  if (cinfo->progressive_mode && cinfo->block_size != DCTSIZE)
+    emit_pseudo_sos(cinfo);
 }
 
 
@@ -566,19 +591,12 @@ write_scan_header (j_compress_ptr cinfo)
      */
     for (i = 0; i < cinfo->comps_in_scan; i++) {
       compptr = cinfo->cur_comp_info[i];
-      if (cinfo->progressive_mode) {
-	/* Progressive mode: only DC or only AC tables are used in one scan */
-	if (cinfo->Ss == 0) {
-	  if (cinfo->Ah == 0)	/* DC needs no table for refinement scan */
-	    emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
-	} else {
-	  emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
-	}
-      } else {
-	/* Sequential mode: need both DC and AC tables */
+      /* DC needs no table for refinement scan */
+      if (cinfo->Ss == 0 && cinfo->Ah == 0)
 	emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
+      /* AC needs no table when not present */
+      if (cinfo->Se)
 	emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
-      }
     }
   }
 
diff --git a/3rdparty/libjpeg/jcmaster.c b/3rdparty/libjpeg/jcmaster.c
index aab4020b87..caf80a53b3 100644
--- a/3rdparty/libjpeg/jcmaster.c
+++ b/3rdparty/libjpeg/jcmaster.c
@@ -2,6 +2,7 @@
  * jcmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -42,23 +43,220 @@ typedef my_comp_master * my_master_ptr;
  * Support routines that do various essential calculations.
  */
 
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+/* Do computations that are needed before master selection phase */
+{
+#ifdef DCT_SCALING_SUPPORTED
+
+  /* Sanity check on input image dimensions to prevent overflow in
+   * following calculation.
+   * We do check jpeg_width and jpeg_height in initial_setup below,
+   * but image_width and image_height can come from arbitrary data,
+   * and we need some space for multiplication by block_size.
+   */
+  if (((long) cinfo->image_width >> 24) || ((long) cinfo->image_height >> 24))
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+
+  /* Compute actual JPEG image dimensions and DCT scaling choices. */
+  if (cinfo->scale_num >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width * cinfo->block_size;
+    cinfo->jpeg_height = cinfo->image_height * cinfo->block_size;
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * 2 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/2 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 2L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 2L);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/3 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 3L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 3L);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num * 4 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/4 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 4L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 4L);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/5 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 5L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 5L);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * 6 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/6 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 6L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 6L);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/7 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 7L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 7L);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num * 8 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/8 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 8L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 8L);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/9 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 9L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 9L);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * 10 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/10 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 10L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 10L);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/11 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 11L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 11L);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * 12 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/12 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 12L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 12L);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/13 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 13L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 13L);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * 14 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/14 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 14L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 14L);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/15 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 15L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 15L);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide block_size/16 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 16L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 16L);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+#else /* !DCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+
+#endif /* DCT_SCALING_SUPPORTED */
+}
+
+
+LOCAL(void)
+jpeg_calc_trans_dimensions (j_compress_ptr cinfo)
+{
+  if (cinfo->min_DCT_h_scaled_size != cinfo->min_DCT_v_scaled_size)
+    ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	     cinfo->min_DCT_h_scaled_size, cinfo->min_DCT_v_scaled_size);
+
+  cinfo->block_size = cinfo->min_DCT_h_scaled_size;
+}
+
+
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo)
+initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
-  int ci;
+  int ci, ssize;
   jpeg_component_info *compptr;
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
 
+  if (transcode_only)
+    jpeg_calc_trans_dimensions(cinfo);
+  else
+    jpeg_calc_jpeg_dimensions(cinfo);
+
+  /* Sanity check on block_size */
+  if (cinfo->block_size < 1 || cinfo->block_size > 16)
+    ERREXIT2(cinfo, JERR_BAD_DCTSIZE, cinfo->block_size, cinfo->block_size);
+
+  /* Derive natural_order from block_size */
+  switch (cinfo->block_size) {
+  case 2: cinfo->natural_order = jpeg_natural_order2; break;
+  case 3: cinfo->natural_order = jpeg_natural_order3; break;
+  case 4: cinfo->natural_order = jpeg_natural_order4; break;
+  case 5: cinfo->natural_order = jpeg_natural_order5; break;
+  case 6: cinfo->natural_order = jpeg_natural_order6; break;
+  case 7: cinfo->natural_order = jpeg_natural_order7; break;
+  default: cinfo->natural_order = jpeg_natural_order; break;
+  }
+
+  /* Derive lim_Se from block_size */
+  cinfo->lim_Se = cinfo->block_size < DCTSIZE ?
+    cinfo->block_size * cinfo->block_size - 1 : DCTSIZE2-1;
+
   /* Sanity check on image dimensions */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
-      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+  if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0 ||
+      cinfo->num_components <= 0 || cinfo->input_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+  if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION ||
+      (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
 
   /* Width of an input scanline must be representable as JDIMENSION. */
@@ -95,22 +293,52 @@ initial_setup (j_compress_ptr cinfo)
        ci++, compptr++) {
     /* Fill in the correct component_index value; don't rely on application */
     compptr->component_index = ci;
-    /* For compression, we never do DCT scaling. */
-    compptr->DCT_scaled_size = DCTSIZE;
+    /* In selecting the actual DCT scaling for each component, we try to
+     * scale down the chroma components via DCT scaling rather than downsampling.
+     * This saves time if the downsampler gets to use 1:1 scaling.
+     * Note this code adapts subsampling ratios which are powers of 2.
+     */
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+	   (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+	   (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support DCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+	compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+	compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
+
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor,
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor,
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_width *
+		    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_height *
+		    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -119,8 +347,8 @@ initial_setup (j_compress_ptr cinfo)
    * main controller will call coefficient controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long) cinfo->jpeg_height,
+		  (long) (cinfo->max_v_samp_factor * cinfo->block_size));
 }
 
 
@@ -260,6 +488,39 @@ validate_script (j_compress_ptr cinfo)
   }
 }
 
+
+LOCAL(void)
+reduce_script (j_compress_ptr cinfo)
+/* Adapt scan script for use with reduced block size;
+ * assume that script has been validated before.
+ */
+{
+  jpeg_scan_info * scanptr;
+  int idxout, idxin;
+
+  /* Circumvent const declaration for this function */
+  scanptr = (jpeg_scan_info *) cinfo->scan_info;
+  idxout = 0;
+
+  for (idxin = 0; idxin < cinfo->num_scans; idxin++) {
+    /* After skipping, idxout becomes smaller than idxin */
+    if (idxin != idxout)
+      /* Copy rest of data;
+       * note we stay in given chunk of allocated memory.
+       */
+      scanptr[idxout] = scanptr[idxin];
+    if (scanptr[idxout].Ss > cinfo->lim_Se)
+      /* Entire scan out of range - skip this entry */
+      continue;
+    if (scanptr[idxout].Se > cinfo->lim_Se)
+      /* Limit scan to end of block */
+      scanptr[idxout].Se = cinfo->lim_Se;
+    idxout++;
+  }
+
+  cinfo->num_scans = idxout;
+}
+
 #endif /* C_MULTISCAN_FILES_SUPPORTED */
 
 
@@ -280,10 +541,13 @@ select_scan_parameters (j_compress_ptr cinfo)
       cinfo->cur_comp_info[ci] =
 	&cinfo->comp_info[scanptr->component_index[ci]];
     }
-    cinfo->Ss = scanptr->Ss;
-    cinfo->Se = scanptr->Se;
-    cinfo->Ah = scanptr->Ah;
-    cinfo->Al = scanptr->Al;
+    if (cinfo->progressive_mode) {
+      cinfo->Ss = scanptr->Ss;
+      cinfo->Se = scanptr->Se;
+      cinfo->Ah = scanptr->Ah;
+      cinfo->Al = scanptr->Al;
+      return;
+    }
   }
   else
 #endif
@@ -296,11 +560,11 @@ select_scan_parameters (j_compress_ptr cinfo)
     for (ci = 0; ci < cinfo->num_components; ci++) {
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
     }
-    cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2-1;
-    cinfo->Ah = 0;
-    cinfo->Al = 0;
   }
+  cinfo->Ss = 0;
+  cinfo->Se = cinfo->block_size * cinfo->block_size - 1;
+  cinfo->Ah = 0;
+  cinfo->Al = 0;
 }
 
 
@@ -325,7 +589,7 @@ per_scan_setup (j_compress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = DCTSIZE;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -347,11 +611,11 @@ per_scan_setup (j_compress_ptr cinfo)
     
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width,
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height,
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     
     cinfo->blocks_in_MCU = 0;
     
@@ -361,7 +625,7 @@ per_scan_setup (j_compress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
@@ -433,7 +697,7 @@ prepare_for_pass (j_compress_ptr cinfo)
     /* Do Huffman optimization for a scan after the first one. */
     select_scan_parameters(cinfo);
     per_scan_setup(cinfo);
-    if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+    if (cinfo->Ss != 0 || cinfo->Ah == 0) {
       (*cinfo->entropy->start_pass) (cinfo, TRUE);
       (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
       master->pub.call_pass_startup = FALSE;
@@ -554,11 +818,13 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
   master->pub.is_last_pass = FALSE;
 
   /* Validate parameters, determine derived values */
-  initial_setup(cinfo);
+  initial_setup(cinfo, transcode_only);
 
   if (cinfo->scan_info != NULL) {
 #ifdef C_MULTISCAN_FILES_SUPPORTED
     validate_script(cinfo);
+    if (cinfo->block_size < DCTSIZE)
+      reduce_script(cinfo);
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
@@ -567,8 +833,10 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode)	/*  TEMPORARY HACK ??? */
-    cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
+  if ((cinfo->progressive_mode || cinfo->block_size < DCTSIZE) &&
+      !cinfo->arith_code)			/*  TEMPORARY HACK ??? */
+    /* assume default tables no good for progressive or downscale mode */
+    cinfo->optimize_coding = TRUE;
 
   /* Initialize my private state */
   if (transcode_only) {
diff --git a/3rdparty/libjpeg/jconfig.h b/3rdparty/libjpeg/jconfig.h
index f5269dfe18..025ff5a304 100644
--- a/3rdparty/libjpeg/jconfig.h
+++ b/3rdparty/libjpeg/jconfig.h
@@ -1,6 +1,6 @@
 /* jconfig.h.  Generated automatically by configure.  */
 /* jconfig.cfg --- source file edited by configure script */
-/* see jconfig.doc for explanations */
+/* see jconfig.txt for explanations */
 
 #define HAVE_PROTOTYPES 
 #define HAVE_UNSIGNED_CHAR 
diff --git a/3rdparty/libjpeg/jcparam.c b/3rdparty/libjpeg/jcparam.c
index 6fc48f5365..c5e85dda55 100644
--- a/3rdparty/libjpeg/jcparam.c
+++ b/3rdparty/libjpeg/jcparam.c
@@ -2,6 +2,7 @@
  * jcparam.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -60,6 +61,47 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }
 
 
+/* These are the sample quantization tables given in JPEG spec section K.1.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+  17,  18,  24,  47,  99,  99,  99,  99,
+  18,  21,  26,  66,  99,  99,  99,  99,
+  24,  26,  56,  99,  99,  99,  99,  99,
+  47,  66,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99
+};
+
+
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting, using default tables
+ * and straight percentage-scaling quality scales.
+ * This entry point allows different scalings for luminance and chrominance.
+ */
+{
+  /* Set up two quantization tables using the specified scaling */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+		       cinfo->q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+		       cinfo->q_scale_factor[1], force_baseline);
+}
+
+
 GLOBAL(void)
 jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 			 boolean force_baseline)
@@ -69,31 +111,6 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  * applications that insist on a linear percentage scaling.
  */
 {
-  /* These are the sample quantization tables given in JPEG spec section K.1.
-   * The spec says that the values given produce "good" quality, and
-   * when divided by 2, "very good" quality.
-   */
-  static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-    16,  11,  10,  16,  24,  40,  51,  61,
-    12,  12,  14,  19,  26,  58,  60,  55,
-    14,  13,  16,  24,  40,  57,  69,  56,
-    14,  17,  22,  29,  51,  87,  80,  62,
-    18,  22,  37,  56,  68, 109, 103,  77,
-    24,  35,  55,  64,  81, 104, 113,  92,
-    49,  64,  78,  87, 103, 121, 120, 101,
-    72,  92,  95,  98, 112, 100, 103,  99
-  };
-  static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99
-  };
-
   /* Set up two quantization tables using the specified scaling */
   jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
 		       scale_factor, force_baseline);
@@ -284,6 +301,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)
 
   /* Initialize everything not dependent on the color space */
 
+  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_denom = 1;
   cinfo->data_precision = BITS_IN_JSAMPLE;
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
@@ -320,6 +339,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   /* By default, use the simpler non-cosited sampling alignment */
   cinfo->CCIR601_sampling = FALSE;
 
+  /* By default, apply fancy downsampling */
+  cinfo->do_fancy_downsampling = TRUE;
+
   /* No input smoothing */
   cinfo->smoothing_factor = 0;
 
diff --git a/3rdparty/libjpeg/jcphuff.c b/3rdparty/libjpeg/jcphuff.c
deleted file mode 100644
index 07f9178b01..0000000000
--- a/3rdparty/libjpeg/jcphuff.c
+++ /dev/null
@@ -1,833 +0,0 @@
-/*
- * jcphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy encoding routines for progressive JPEG.
- *
- * We do not support output suspension in this module, since the library
- * currently does not allow multiple-scan files to be written with output
- * suspension.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jchuff.c */
-
-#ifdef C_PROGRESSIVE_SUPPORTED
-
-/* Expanded entropy encoder object for progressive Huffman encoding. */
-
-typedef struct {
-  struct jpeg_entropy_encoder pub; /* public fields */
-
-  /* Mode flag: TRUE for optimization, FALSE for actual data output */
-  boolean gather_statistics;
-
-  /* Bit-level coding status.
-   * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
-   */
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
-  INT32 put_buffer;		/* current bit-accumulation buffer */
-  int put_bits;			/* # of bits now in it */
-  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
-
-  /* Coding status for DC components */
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-
-  /* Coding status for AC components */
-  int ac_tbl_no;		/* the table number of the single component */
-  unsigned int EOBRUN;		/* run length of EOBs */
-  unsigned int BE;		/* # of buffered correction bits before MCU */
-  char * bit_buffer;		/* buffer for correction bits (1 per char) */
-  /* packing correction bits tightly would save some space but cost time... */
-
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
-
-  /* Pointers to derived tables (these workspaces have image lifespan).
-   * Since any one scan codes only DC or only AC, we only need one set
-   * of tables, not one for DC and one for AC.
-   */
-  c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  /* Statistics tables for optimization; again, one set is enough */
-  long * count_ptrs[NUM_HUFF_TBLS];
-} phuff_entropy_encoder;
-
-typedef phuff_entropy_encoder * phuff_entropy_ptr;
-
-/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
- * buffer can hold.  Larger sizes may slightly improve compression, but
- * 1000 is already well into the realm of overkill.
- * The minimum safe size is 64 bits.
- */
-
-#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
-
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
- * which should be safe.
- */
-
-#ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-	((ishift_temp = (x)) < 0 ? \
-	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-	 (ishift_temp >> (shft)))
-#else
-#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
-#endif
-
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_first JPP((j_compress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_DC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_refine JPP((j_compress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_phuff JPP((j_compress_ptr cinfo));
-METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo));
-
-
-/*
- * Initialize for a Huffman-compressed scan using progressive JPEG.
- */
-
-METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
-{  
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-
-  entropy->cinfo = cinfo;
-  entropy->gather_statistics = gather_statistics;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* We assume jcmaster.c already validated the scan parameters. */
-
-  /* Select execution routines */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_first;
-    else
-      entropy->pub.encode_mcu = encode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_refine;
-    else {
-      entropy->pub.encode_mcu = encode_mcu_AC_refine;
-      /* AC refinement needs a correction bit buffer */
-      if (entropy->bit_buffer == NULL)
-	entropy->bit_buffer = (char *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      MAX_CORR_BITS * SIZEOF(char));
-    }
-  }
-  if (gather_statistics)
-    entropy->pub.finish_pass = finish_pass_gather_phuff;
-  else
-    entropy->pub.finish_pass = finish_pass_phuff;
-
-  /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1
-   * for AC coefficients.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Initialize DC predictions to 0 */
-    entropy->last_dc_val[ci] = 0;
-    /* Get table index */
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      entropy->ac_tbl_no = tbl = compptr->ac_tbl_no;
-    }
-    if (gather_statistics) {
-      /* Check for invalid table index */
-      /* (make_c_derived_tbl does this in the other path) */
-      if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
-        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
-      /* Allocate and zero the statistics tables */
-      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
-      if (entropy->count_ptrs[tbl] == NULL)
-	entropy->count_ptrs[tbl] = (long *)
-	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      257 * SIZEOF(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
-    } else {
-      /* Compute derived values for Huffman table */
-      /* We may do this more than once for a table, but it's not expensive */
-      jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-			      & entropy->derived_tbls[tbl]);
-    }
-  }
-
-  /* Initialize AC stuff */
-  entropy->EOBRUN = 0;
-  entropy->BE = 0;
-
-  /* Initialize bit buffer to empty */
-  entropy->put_buffer = 0;
-  entropy->put_bits = 0;
-
-  /* Initialize restart stuff */
-  entropy->restarts_to_go = cinfo->restart_interval;
-  entropy->next_restart_num = 0;
-}
-
-
-/* Outputting bytes to the file.
- * NB: these must be called only when actually outputting,
- * that is, entropy->gather_statistics == FALSE.
- */
-
-/* Emit a byte */
-#define emit_byte(entropy,val)  \
-	{ *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-	  if (--(entropy)->free_in_buffer == 0)  \
-	    dump_buffer(entropy); }
-
-
-LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
-/* Empty the output buffer; we do not support suspension in this module. */
-{
-  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
-
-  if (! (*dest->empty_output_buffer) (entropy->cinfo))
-    ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
-  /* After a successful buffer dump, must reset buffer pointers */
-  entropy->next_output_byte = dest->next_output_byte;
-  entropy->free_in_buffer = dest->free_in_buffer;
-}
-
-
-/* Outputting bits to the file */
-
-/* Only the right 24 bits of put_buffer are used; the valid bits are
- * left-justified in this part.  At most 16 bits can be passed to emit_bits
- * in one call, and we never retain more than 7 bits in put_buffer
- * between calls, so 24 bits are sufficient.
- */
-
-INLINE
-LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
-/* Emit some bits, unless we are in gather mode */
-{
-  /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
-  register int put_bits = entropy->put_bits;
-
-  /* if size is 0, caller used an invalid Huffman table entry */
-  if (size == 0)
-    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-  if (entropy->gather_statistics)
-    return;			/* do nothing if we're only getting stats */
-
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-  
-  put_bits += size;		/* new number of bits in buffer */
-  
-  put_buffer <<= 24 - put_bits; /* align incoming bits */
-
-  put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
-
-  while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
-    
-    emit_byte(entropy, c);
-    if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(entropy, 0);
-    }
-    put_buffer <<= 8;
-    put_bits -= 8;
-  }
-
-  entropy->put_buffer = put_buffer; /* update variables */
-  entropy->put_bits = put_bits;
-}
-
-
-LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
-{
-  emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
-  entropy->put_buffer = 0;     /* and reset bit-buffer to empty */
-  entropy->put_bits = 0;
-}
-
-
-/*
- * Emit (or just count) a Huffman symbol.
- */
-
-INLINE
-LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
-{
-  if (entropy->gather_statistics)
-    entropy->count_ptrs[tbl_no][symbol]++;
-  else {
-    c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
-    emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
-  }
-}
-
-
-/*
- * Emit bits from a correction bit buffer.
- */
-
-LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart,
-		    unsigned int nbits)
-{
-  if (entropy->gather_statistics)
-    return;			/* no real work */
-
-  while (nbits > 0) {
-    emit_bits(entropy, (unsigned int) (*bufstart), 1);
-    bufstart++;
-    nbits--;
-  }
-}
-
-
-/*
- * Emit any pending EOBRUN symbol.
- */
-
-LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
-{
-  register int temp, nbits;
-
-  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
-    temp = entropy->EOBRUN;
-    nbits = 0;
-    while ((temp >>= 1))
-      nbits++;
-    /* safety check: shouldn't happen given limited correction-bit buffer */
-    if (nbits > 14)
-      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-    emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
-    if (nbits)
-      emit_bits(entropy, entropy->EOBRUN, nbits);
-
-    entropy->EOBRUN = 0;
-
-    /* Emit any buffered correction bits */
-    emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
-{
-  int ci;
-
-  emit_eobrun(entropy);
-
-  if (! entropy->gather_statistics) {
-    flush_bits(entropy);
-    emit_byte(entropy, 0xFF);
-    emit_byte(entropy, JPEG_RST0 + restart_num);
-  }
-
-  if (entropy->cinfo->Ss == 0) {
-    /* Re-initialize DC predictions to 0 */
-    for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
-      entropy->last_dc_val[ci] = 0;
-  } else {
-    /* Re-initialize all AC-related fields to 0 */
-    entropy->EOBRUN = 0;
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * MCU encoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  int blkn, ci;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  jpeg_component_info * compptr;
-  ISHIFT_TEMPS
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-
-    /* Compute the DC value after the required point transform by Al.
-     * This is simply an arithmetic right shift.
-     */
-    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
-
-    /* DC differences are figured on the point-transformed values. */
-    temp = temp2 - entropy->last_dc_val[ci];
-    entropy->last_dc_val[ci] = temp2;
-
-    /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
-      /* This code assumes we are on a two's complement machine */
-      temp2--;
-    }
-    
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
-    /* Check for out-of-range coefficient values.
-     * Since we're encoding a difference, the range limit is twice as much.
-     */
-    if (nbits > MAX_COEF_BITS+1)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-    
-    /* Count/emit the Huffman-coded symbol for the number of bits */
-    emit_symbol(entropy, compptr->dc_tbl_no, nbits);
-    
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    if (nbits)			/* emit_bits rejects calls with size 0 */
-      emit_bits(entropy, (unsigned int) temp2, nbits);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  register int r, k;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
-  
-  r = 0;			/* r = run length of zeros */
-   
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
-      r++;
-      continue;
-    }
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value; so the code is
-     * interwoven with finding the abs value (temp) and output bits (temp2).
-     */
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      temp >>= Al;		/* apply the point transform */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-      temp2 = ~temp;
-    } else {
-      temp >>= Al;		/* apply the point transform */
-      temp2 = temp;
-    }
-    /* Watch out for case that nonzero coef is zero after point transform */
-    if (temp == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any pending EOBRUN */
-    if (entropy->EOBRUN > 0)
-      emit_eobrun(entropy);
-    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-    while (r > 15) {
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-    }
-
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;			/* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
-    /* Check for out-of-range coefficient values */
-    if (nbits > MAX_COEF_BITS)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
-
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    emit_bits(entropy, (unsigned int) temp2, nbits);
-
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0) {			/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    if (entropy->EOBRUN == 0x7FFF)
-      emit_eobrun(entropy);	/* force it out to avoid overflow */
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  int blkn;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* We simply emit the Al'th bit of the DC coefficient value. */
-    temp = (*block)[0];
-    emit_bits(entropy, (unsigned int) (temp >> Al), 1);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  register int r, k;
-  int EOB;
-  char *BR_buffer;
-  unsigned int BR;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  int absvalues[DCTSIZE2];
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* It is convenient to make a pre-pass to determine the transformed
-   * coefficients' absolute values and the EOB position.
-   */
-  EOB = 0;
-  for (k = cinfo->Ss; k <= Se; k++) {
-    temp = (*block)[jpeg_natural_order[k]];
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value.
-     */
-    if (temp < 0)
-      temp = -temp;		/* temp is abs value of input */
-    temp >>= Al;		/* apply the point transform */
-    absvalues[k] = temp;	/* save abs value for main pass */
-    if (temp == 1)
-      EOB = k;			/* EOB = index of last newly-nonzero coef */
-  }
-
-  /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
-  
-  r = 0;			/* r = run length of zeros */
-  BR = 0;			/* BR = count of buffered bits added now */
-  BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
-
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = absvalues[k]) == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any required ZRLs, but not if they can be folded into EOB */
-    while (r > 15 && k <= EOB) {
-      /* emit any pending EOBRUN and the BE correction bits */
-      emit_eobrun(entropy);
-      /* Emit ZRL */
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-      /* Emit buffered correction bits that must be associated with ZRL */
-      emit_buffered_bits(entropy, BR_buffer, BR);
-      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-      BR = 0;
-    }
-
-    /* If the coef was previously nonzero, it only needs a correction bit.
-     * NOTE: a straight translation of the spec's figure G.7 would suggest
-     * that we also need to test r > 15.  But if r > 15, we can only get here
-     * if k > EOB, which implies that this coefficient is not 1.
-     */
-    if (temp > 1) {
-      /* The correction bit is the next bit of the absolute value. */
-      BR_buffer[BR++] = (char) (temp & 1);
-      continue;
-    }
-
-    /* Emit any pending EOBRUN and the BE correction bits */
-    emit_eobrun(entropy);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
-
-    /* Emit output bit for newly-nonzero coef */
-    temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
-    emit_bits(entropy, (unsigned int) temp, 1);
-
-    /* Emit buffered correction bits that must be associated with this code */
-    emit_buffered_bits(entropy, BR_buffer, BR);
-    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-    BR = 0;
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0 || BR > 0) {	/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    entropy->BE += BR;		/* concat my correction bits to older ones */
-    /* We force out the EOB if we risk either:
-     * 1. overflow of the EOB counter;
-     * 2. overflow of the correction bit buffer during the next MCU.
-     */
-    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
-      emit_eobrun(entropy);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * Finish up at the end of a Huffman-compressed progressive scan.
- */
-
-METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Flush out any buffered data */
-  emit_eobrun(entropy);
-  flush_bits(entropy);
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-}
-
-
-/*
- * Finish up a statistics-gathering pass and create the new Huffman tables.
- */
-
-METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-  JHUFF_TBL **htblptr;
-  boolean did[NUM_HUFF_TBLS];
-
-  /* Flush out buffered data (all we care about is counting the EOB symbol) */
-  emit_eobrun(entropy);
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* It's important not to apply jpeg_gen_optimal_table more than once
-   * per table, because it clobbers the input frequency counts!
-   */
-  MEMZERO(did, SIZEOF(did));
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-	continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      tbl = compptr->ac_tbl_no;
-    }
-    if (! did[tbl]) {
-      if (is_DC_band)
-        htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
-      else
-        htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
-      if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
-      did[tbl] = TRUE;
-    }
-  }
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy encoding.
- */
-
-GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff;
-
-  /* Mark tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-    entropy->count_ptrs[i] = NULL;
-  }
-  entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
-}
-
-#endif /* C_PROGRESSIVE_SUPPORTED */
diff --git a/3rdparty/libjpeg/jcprepct.c b/3rdparty/libjpeg/jcprepct.c
index fa93333db2..be44cc4b45 100644
--- a/3rdparty/libjpeg/jcprepct.c
+++ b/3rdparty/libjpeg/jcprepct.c
@@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo,
 	*out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	   ci++, compptr++) {
+	numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		  cinfo->min_DCT_v_scaled_size;
 	expand_bottom_edge(output_buf[ci],
-			   compptr->width_in_blocks * DCTSIZE,
-			   (int) (*out_row_group_ctr * compptr->v_samp_factor),
-			   (int) (out_row_groups_avail * compptr->v_samp_factor));
+			   compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+			   (int) (*out_row_group_ctr * numrows),
+			   (int) (out_row_groups_avail * numrows));
       }
       *out_row_group_ctr = out_row_groups_avail;
       break;			/* can exit outer loop without test */
@@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo)
      */
     true_buffer = (*cinfo->mem->alloc_sarray)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+       (JDIMENSION) (((long) compptr->width_in_blocks *
+		      cinfo->min_DCT_h_scaled_size *
 		      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
        (JDIMENSION) (3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
@@ -346,7 +349,8 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 	 ci++, compptr++) {
       prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+	 (JDIMENSION) (((long) compptr->width_in_blocks *
+			cinfo->min_DCT_h_scaled_size *
 			cinfo->max_h_samp_factor) / compptr->h_samp_factor),
 	 (JDIMENSION) cinfo->max_v_samp_factor);
     }
diff --git a/3rdparty/libjpeg/jcsample.c b/3rdparty/libjpeg/jcsample.c
index 212ec8757c..4d36f85f35 100644
--- a/3rdparty/libjpeg/jcsample.c
+++ b/3rdparty/libjpeg/jcsample.c
@@ -62,6 +62,15 @@ typedef struct {
 
   /* Downsampling method pointers, one per component */
   downsample1_ptr methods[MAX_COMPONENTS];
+
+  /* Height of an output row group for each component. */
+  int rowgroup_height[MAX_COMPONENTS];
+
+  /* These arrays save pixel expansion factors so that int_downsample need not
+   * recompute them each time.  They are unused for other downsampling methods.
+   */
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
 } my_downsampler;
 
 typedef my_downsampler * my_downsample_ptr;
@@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo,
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     in_ptr = input_buf[ci] + in_row_index;
-    out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor);
+    out_ptr = output_buf[ci] +
+	      (out_row_group_index * downsample->rowgroup_height[ci]);
     (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr);
   }
 }
@@ -140,14 +150,15 @@ METHODDEF(void)
 int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;	/* outcol_h == outcol*h_expand */
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   JSAMPROW inptr, outptr;
   INT32 outvalue;
 
-  h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
-  v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
+  h_expand = downsample->h_expand[compptr->component_index];
+  v_expand = downsample->v_expand[compptr->component_index];
   numpix = h_expand * v_expand;
   numpix2 = numpix/2;
 
@@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * h_expand);
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     for (outcol = 0, outcol_h = 0; outcol < output_cols;
 	 outcol++, outcol_h += h_expand) {
@@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
       *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
     }
     inrow += v_expand;
+    outrow++;
   }
 }
 
@@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   jcopy_sample_rows(input_data, 0, output_data, 0,
 		    cinfo->max_v_samp_factor, cinfo->image_width);
   /* Edge-expand */
-  expand_right_edge(output_data, cinfo->max_v_samp_factor,
-		    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+  expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+		    compptr->width_in_blocks * compptr->DCT_h_scaled_size);
 }
 
 
@@ -212,9 +224,9 @@ METHODDEF(void)
 h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr, outptr;
   register int bias;
 
@@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * 2);
 
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
     bias = 0;			/* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
@@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   int inrow, outrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr0, inptr1, outptr;
   register int bias;
 
@@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   expand_right_edge(input_data, cinfo->max_v_samp_factor,
 		    cinfo->image_width, output_cols * 2);
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow+1];
@@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
       inptr0 += 2; inptr1 += 2;
     }
     inrow += 2;
+    outrow++;
   }
 }
 
@@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
   int inrow, outrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
   INT32 membersum, neighsum, memberscale, neighscale;
 
@@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */
   neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */
 
-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
     inptr1 = input_data[inrow+1];
@@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
     *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
 
     inrow += 2;
+    outrow++;
   }
 }
 
@@ -392,9 +406,9 @@ METHODDEF(void)
 fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
 			    JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
   register JSAMPROW inptr, above_ptr, below_ptr, outptr;
   INT32 membersum, neighsum, memberscale, neighscale;
   int colsum, lastcolsum, nextcolsum;
@@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
   memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */
   neighscale = cinfo->smoothing_factor * 64; /* scaled SF */
 
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
-    above_ptr = input_data[outrow-1];
-    below_ptr = input_data[outrow+1];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
+    above_ptr = input_data[inrow-1];
+    below_ptr = input_data[inrow+1];
 
     /* Special case for first column */
     colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
@@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo)
   int ci;
   jpeg_component_info * compptr;
   boolean smoothok = TRUE;
+  int h_in_group, v_in_group, h_out_group, v_out_group;
 
   downsample = (my_downsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo)
   /* Verify we can handle the sampling factors, and set up method pointers */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
-	compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    /* Compute size of an "output group" for DCT scaling.  This many samples
+     * are to be converted from max_h_samp_factor * max_v_samp_factor pixels.
+     */
+    h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+		  cinfo->min_DCT_h_scaled_size;
+    v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		  cinfo->min_DCT_v_scaled_size;
+    h_in_group = cinfo->max_h_samp_factor;
+    v_in_group = cinfo->max_v_samp_factor;
+    downsample->rowgroup_height[ci] = v_out_group; /* save for use later */
+    if (h_in_group == h_out_group && v_in_group == v_out_group) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
 	downsample->methods[ci] = fullsize_smooth_downsample;
@@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo)
       } else
 #endif
 	downsample->methods[ci] = fullsize_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+	       v_in_group == v_out_group) {
       smoothok = FALSE;
       downsample->methods[ci] = h2v1_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+	       v_in_group == v_out_group * 2) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
 	downsample->methods[ci] = h2v2_smooth_downsample;
@@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo)
       } else
 #endif
 	downsample->methods[ci] = h2v2_downsample;
-    } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
-	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
+    } else if ((h_in_group % h_out_group) == 0 &&
+	       (v_in_group % v_out_group) == 0) {
       smoothok = FALSE;
       downsample->methods[ci] = int_downsample;
+      downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group);
+      downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
   }
diff --git a/3rdparty/libjpeg/jctrans.c b/3rdparty/libjpeg/jctrans.c
index 0e6d70769d..f7d7b81491 100644
--- a/3rdparty/libjpeg/jctrans.c
+++ b/3rdparty/libjpeg/jctrans.c
@@ -2,6 +2,7 @@
  * jctrans.c
  *
  * Copyright (C) 1995-1998, Thomas G. Lane.
+ * Modified 2000-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -76,6 +77,10 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
   dstinfo->image_height = srcinfo->image_height;
   dstinfo->input_components = srcinfo->num_components;
   dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->jpeg_width = srcinfo->output_width;
+  dstinfo->jpeg_height = srcinfo->output_height;
+  dstinfo->min_DCT_h_scaled_size = srcinfo->min_DCT_h_scaled_size;
+  dstinfo->min_DCT_v_scaled_size = srcinfo->min_DCT_v_scaled_size;
   /* Initialize all parameters to default values */
   jpeg_set_defaults(dstinfo);
   /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB.
@@ -158,25 +163,14 @@ LOCAL(void)
 transencode_master_selection (j_compress_ptr cinfo,
 			      jvirt_barray_ptr * coef_arrays)
 {
-  /* Although we don't actually use input_components for transcoding,
-   * jcmaster.c's initial_setup will complain if input_components is 0.
-   */
-  cinfo->input_components = 1;
   /* Initialize master control (includes parameter checking/processing) */
   jinit_c_master_control(cinfo, TRUE /* transcode only */);
 
   /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_encoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
+    jinit_huff_encoder(cinfo);
   }
 
   /* We need a special coefficient buffer controller. */
@@ -381,7 +375,7 @@ transencode_coef_controller (j_compress_ptr cinfo,
   buffer = (JBLOCKROW)
     (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-  jzero_far((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+  FMEMZERO((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
   for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
     coef->dummy_buffer[i] = buffer + i;
   }
diff --git a/3rdparty/libjpeg/jdapimin.c b/3rdparty/libjpeg/jdapimin.c
index cadb59fce3..7f1ce4c05b 100644
--- a/3rdparty/libjpeg/jdapimin.c
+++ b/3rdparty/libjpeg/jdapimin.c
@@ -2,6 +2,7 @@
  * jdapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -185,8 +186,8 @@ default_decompress_parms (j_decompress_ptr cinfo)
   }
 
   /* Set defaults for other decompression parameters. */
-  cinfo->scale_num = 1;		/* 1:1 scaling */
-  cinfo->scale_denom = 1;
+  cinfo->scale_num = cinfo->block_size;		/* 1:1 scaling */
+  cinfo->scale_denom = cinfo->block_size;
   cinfo->output_gamma = 1.0;
   cinfo->buffered_image = FALSE;
   cinfo->raw_data_out = FALSE;
diff --git a/3rdparty/libjpeg/jdapistd.c b/3rdparty/libjpeg/jdapistd.c
index c8e3fa0c35..9d74537772 100644
--- a/3rdparty/libjpeg/jdapistd.c
+++ b/3rdparty/libjpeg/jdapistd.c
@@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
   }
 
   /* Verify that at least one iMCU row can be returned. */
-  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size;
   if (max_lines < lines_per_iMCU_row)
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
diff --git a/3rdparty/libjpeg/jdarith.c b/3rdparty/libjpeg/jdarith.c
new file mode 100644
index 0000000000..092f8af5fe
--- /dev/null
+++ b/3rdparty/libjpeg/jdarith.c
@@ -0,0 +1,776 @@
+/*
+ * jdarith.c
+ *
+ * Developed 1997-2011 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  INT32 c;       /* C register, base of coding interval + input bit buffer */
+  INT32 a;               /* A register, normalized size of coding interval */
+  int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
+                                                         /* init: ct = -16 */
+                                                         /* run: ct = 0..7 */
+                                                         /* error: ct = -1 */
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte (j_decompress_ptr cinfo)
+/* Read next input byte; we do not support suspension in this module. */
+{
+  struct jpeg_source_mgr * src = cinfo->src;
+
+  if (src->bytes_in_buffer == 0)
+    if (! (*src->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  src->bytes_in_buffer--;
+  return GETJOCTET(*src->next_input_byte++);
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can get away with any renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv, data;
+
+  /* Renormalization & data input per section D.2.6 */
+  while (e->a < 0x8000L) {
+    if (--e->ct < 0) {
+      /* Need to fetch next data byte */
+      if (cinfo->unread_marker)
+	data = 0;		/* stuff zero data */
+      else {
+	data = get_byte(cinfo);	/* read next input byte */
+	if (data == 0xFF) {	/* zero stuff or marker code */
+	  do data = get_byte(cinfo);
+	  while (data == 0xFF);	/* swallow extra 0xFF bytes */
+	  if (data == 0)
+	    data = 0xFF;	/* discard stuffed zero byte */
+	  else {
+	    /* Note: Different from the Huffman decoder, hitting
+	     * a marker while processing the compressed data
+	     * segment is legal in arithmetic coding.
+	     * The convention is to supply zero data
+	     * then until decoding is complete.
+	     */
+	    cinfo->unread_marker = data;
+	    data = 0;
+	  }
+	}
+      }
+      e->c = (e->c << 8) | data; /* insert data into C register */
+      if ((e->ct += 8) < 0)	 /* update bit shift counter */
+	/* Need more initial bytes */
+	if (++e->ct == 0)
+	  /* Got 2 initial bytes -> re-init A and exit loop */
+	  e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+    }
+    e->a <<= 1;
+  }
+
+  /* Fetch values from our compact representation of Table D.3(D.2):
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+  temp = e->a - qe;
+  e->a = temp;
+  temp <<= e->ct;
+  if (e->c >= temp) {
+    e->c -= temp;
+    /* Conditional LPS (less probable symbol) exchange */
+    if (e->a < qe) {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    } else {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    }
+  } else if (e->a < 0x8000L) {
+    /* Conditional MPS (more probable symbol) exchange */
+    if (e->a < qe) {
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    } else {
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    }
+  }
+
+  return sv >> 7;
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  /* Advance past the RSTn marker */
+  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+	(cinfo->progressive_mode && cinfo->Ss)) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Reset restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
+    (*block)[0] = (JCOEF) (entropy->last_dc_val[ci] << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+  /* Figure F.20: Decode_AC_coefficients */
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    while (arith_decode(cinfo, st + 1) == 0) {
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+    /* Figure F.21: Decoding nonzero value v */
+    /* Figure F.22: Decoding the sign of v */
+    sign = arith_decode(cinfo, entropy->fixed_bin);
+    st += 2;
+    /* Figure F.23: Decoding the magnitude category of v */
+    if ((m = arith_decode(cinfo, st)) != 0) {
+      if (arith_decode(cinfo, st)) {
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+    }
+    v = m;
+    /* Figure F.24: Decoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      if (arith_decode(cinfo, st)) v |= m;
+    v += 1; if (sign) v = -v;
+    /* Scale and output coefficient in natural (dezigzagged) order */
+    (*block)[natural_order[k]] = (JCOEF) (v << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int p1, blkn;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    if (arith_decode(cinfo, st))
+      MCU_data[blkn][0][0] |= p1;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  unsigned char *st;
+  int tbl, k, kex;
+  int p1, m1;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = cinfo->Se; kex > 0; kex--)
+    if ((*block)[natural_order[kex]]) break;
+
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+    for (;;) {
+      thiscoef = *block + natural_order[k];
+      if (*thiscoef) {				/* previously nonzero coef */
+	if (arith_decode(cinfo, st + 2)) {
+	  if (*thiscoef < 0)
+	    *thiscoef += m1;
+	  else
+	    *thiscoef += p1;
+	}
+	break;
+      }
+      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
+	if (arith_decode(cinfo, entropy->fixed_bin))
+	  *thiscoef = m1;
+	else
+	  *thiscoef = p1;
+	break;
+      }
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+
+    /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+    if (cinfo->lim_Se == 0) continue;
+    tbl = compptr->ac_tbl_no;
+    k = 0;
+
+    /* Figure F.20: Decode_AC_coefficients */
+    do {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      for (;;) {
+	k++;
+	if (arith_decode(cinfo, st + 1)) break;
+	st += 3;
+	if (k >= cinfo->lim_Se) {
+	  WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	  entropy->ct = -1;			/* spectral overflow */
+	  return TRUE;
+	}
+      }
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, entropy->fixed_bin);
+      st += 2;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	if (arith_decode(cinfo, st)) {
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (arith_decode(cinfo, st)) {
+	    if ((m <<= 1) == 0x8000) {
+	      WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	      entropy->ct = -1;			/* magnitude overflow */
+	      return TRUE;
+	    }
+	    st += 1;
+	  }
+	}
+      }
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      (*block)[natural_order[k]] = (JCOEF) v;
+    } while (k < cinfo->lim_Se);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > cinfo->lim_Se)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	(cinfo->Se < DCTSIZE2 && cinfo->Se != cinfo->lim_Se))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+    /* Select MCU decoding routine */
+    entropy->pub.decode_mcu = decode_mcu;
+  }
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+	(cinfo->progressive_mode && cinfo->Ss)) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+    }
+  }
+
+  /* Initialize arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++) 
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;
+  }
+}
diff --git a/3rdparty/libjpeg/jdatadst.c b/3rdparty/libjpeg/jdatadst.c
index a8f6fb0e02..6981fb73de 100644
--- a/3rdparty/libjpeg/jdatadst.c
+++ b/3rdparty/libjpeg/jdatadst.c
@@ -2,13 +2,14 @@
  * jdatadst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2012 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains compression data destination routines for the case of
- * emitting JPEG data to a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * destination manager.
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
  * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
  * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -19,6 +20,11 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
 
 /* Expanded data destination object for stdio output */
 
@@ -34,6 +40,21 @@ typedef my_destination_mgr * my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
 
 
+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char ** outbuffer;	/* target buffer */
+  unsigned long * outsize;
+  unsigned char * newbuffer;	/* newly allocated buffer */
+  JOCTET * buffer;		/* start of buffer */
+  size_t bufsize;
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr * my_mem_dest_ptr;
+
+
 /*
  * Initialize destination --- called by jpeg_start_compress
  * before any data is actually written.
@@ -53,6 +74,12 @@ init_destination (j_compress_ptr cinfo)
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }
 
+METHODDEF(void)
+init_mem_destination (j_compress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
 
 /*
  * Empty the output buffer --- called whenever buffer fills up.
@@ -92,6 +119,36 @@ empty_output_buffer (j_compress_ptr cinfo)
   return TRUE;
 }
 
+METHODDEF(boolean)
+empty_mem_output_buffer (j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET * nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  /* Try to allocate new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = (JOCTET *) malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+
+  if (dest->newbuffer != NULL)
+    free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+
 
 /*
  * Terminate destination --- called by jpeg_finish_compress
@@ -119,6 +176,15 @@ term_destination (j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 }
 
+METHODDEF(void)
+term_mem_destination (j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+
 
 /*
  * Prepare for output to a stdio stream.
@@ -149,3 +215,53 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
   dest->pub.term_destination = term_destination;
   dest->outfile = outfile;
 }
+
+
+/*
+ * Prepare for output to a memory buffer.
+ * The caller may supply an own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
+ */
+
+GLOBAL(void)
+jpeg_mem_dest (j_compress_ptr cinfo,
+	       unsigned char ** outbuffer, unsigned long * outsize)
+{
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* The destination object is made permanent so that multiple JPEG images
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
+   */
+  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(my_mem_destination_mgr));
+  }
+
+  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->newbuffer = NULL;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    /* Allocate initial buffer */
+    dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+    if (dest->newbuffer == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  dest->pub.free_in_buffer = dest->bufsize = *outsize;
+}
diff --git a/3rdparty/libjpeg/jdatasrc.c b/3rdparty/libjpeg/jdatasrc.c
index edc752bf5d..7be59a88a1 100644
--- a/3rdparty/libjpeg/jdatasrc.c
+++ b/3rdparty/libjpeg/jdatasrc.c
@@ -2,13 +2,14 @@
  * jdatasrc.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains decompression data source routines for the case of
- * reading JPEG data from a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * source manager.
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
  * IMPORTANT: we assume that fread() will correctly transcribe an array of
  * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
  * than 8 bits on your machine, you may need to do some tweaking.
@@ -52,6 +53,12 @@ init_source (j_decompress_ptr cinfo)
   src->start_of_file = TRUE;
 }
 
+METHODDEF(void)
+init_mem_source (j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
 
 /*
  * Fill the input buffer --- called whenever buffer is emptied.
@@ -111,6 +118,27 @@ fill_input_buffer (j_decompress_ptr cinfo)
   return TRUE;
 }
 
+METHODDEF(boolean)
+fill_mem_input_buffer (j_decompress_ptr cinfo)
+{
+  static const JOCTET mybuffer[4] = {
+    (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+  };
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so any request for more data beyond the given buffer size
+   * is treated as an error.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+
+  /* Insert a fake EOI marker */
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+
 
 /*
  * Skip data --- used to skip over a potentially large amount of
@@ -127,22 +155,22 @@ fill_input_buffer (j_decompress_ptr cinfo)
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  struct jpeg_source_mgr * src = cinfo->src;
 
   /* Just a dumb implementation for now.  Could use fseek() except
    * it doesn't work on pipes.  Not clear that being smart is worth
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->pub.bytes_in_buffer) {
-      num_bytes -= (long) src->pub.bytes_in_buffer;
-      (void) fill_input_buffer(cinfo);
+    while (num_bytes > (long) src->bytes_in_buffer) {
+      num_bytes -= (long) src->bytes_in_buffer;
+      (void) (*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->pub.next_input_byte += (size_t) num_bytes;
-    src->pub.bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t) num_bytes;
+    src->bytes_in_buffer -= (size_t) num_bytes;
   }
 }
 
@@ -210,3 +238,38 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
   src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */
   src->pub.next_input_byte = NULL; /* until buffer loaded */
 }
+
+
+/*
+ * Prepare for input from a supplied memory buffer.
+ * The buffer must contain the whole JPEG data.
+ */
+
+GLOBAL(void)
+jpeg_mem_src (j_decompress_ptr cinfo,
+	      unsigned char * inbuffer, unsigned long insize)
+{
+  struct jpeg_source_mgr * src;
+
+  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(struct jpeg_source_mgr));
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = (size_t) insize;
+  src->next_input_byte = (JOCTET *) inbuffer;
+}
diff --git a/3rdparty/libjpeg/jdcoefct.c b/3rdparty/libjpeg/jdcoefct.c
index 4938d20fcb..ed02fc378f 100644
--- a/3rdparty/libjpeg/jdcoefct.c
+++ b/3rdparty/libjpeg/jdcoefct.c
@@ -2,6 +2,7 @@
  * jdcoefct.c
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2002-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -162,8 +163,9 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void FAR *) coef->MCU_buffer[0],
-		(size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
+      if (cinfo->lim_Se)	/* can bypass in DC only case */
+	FMEMZERO((void FAR *) coef->MCU_buffer[0],
+		 (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
 	coef->MCU_vert_offset = yoffset;
@@ -187,7 +189,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						    : compptr->last_col_width;
 	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
+	  yoffset * compptr->DCT_v_scaled_size;
 	start_col = MCU_col_num * compptr->MCU_sample_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (cinfo->input_iMCU_row < last_iMCU_row ||
@@ -197,11 +199,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      (*inverse_DCT) (cinfo, compptr,
 			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
 			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
+	      output_col += compptr->DCT_h_scaled_size;
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
+	  output_ptr += compptr->DCT_v_scaled_size;
 	}
       }
     }
@@ -362,9 +364,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
 			output_ptr, output_col);
 	buffer_ptr++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->DCT_h_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
     }
   }
 
@@ -654,9 +656,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	DC4 = DC5; DC5 = DC6;
 	DC7 = DC8; DC8 = DC9;
 	buffer_ptr++, prev_block_row++, next_block_row++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->DCT_h_scaled_size;
       }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
     }
   }
 
@@ -729,6 +731,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
     }
+    if (cinfo->lim_Se == 0)	/* DC only case: want to bypass later */
+      FMEMZERO((void FAR *) buffer,
+	       (size_t) (D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)));
     coef->pub.consume_data = dummy_consume_data;
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
diff --git a/3rdparty/libjpeg/jdcolor.c b/3rdparty/libjpeg/jdcolor.c
index 6c04dfe8aa..83e4d069ab 100644
--- a/3rdparty/libjpeg/jdcolor.c
+++ b/3rdparty/libjpeg/jdcolor.c
@@ -2,6 +2,7 @@
  * jdcolor.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -23,20 +24,28 @@ typedef struct {
   int * Cb_b_tab;		/* => table for Cb to B conversion */
   INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
   INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
+
+  /* Private state for RGB->Y conversion */
+  INT32 * rgb_y_tab;		/* => table for RGB to Y conversion */
 } my_color_deconverter;
 
 typedef my_color_deconverter * my_cconvert_ptr;
 
 
 /**************** YCbCr -> RGB conversion: most common case **************/
+/****************   RGB -> Y   conversion: less common case **************/
 
 /*
  * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
  * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
  * The conversion equations to be implemented are therefore
+ *
  *	R = Y                + 1.40200 * Cr
  *	G = Y - 0.34414 * Cb - 0.71414 * Cr
  *	B = Y + 1.77200 * Cb
+ *
+ *	Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
  * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
  * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
  *
@@ -61,6 +70,18 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
 #define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
 
+/* We allocate one big table for RGB->Y conversion and divide it up into
+ * three parts, instead of doing three alloc_small requests.  This lets us
+ * use a single table base address, which can be held in a register in the
+ * inner loops on many machines (more than can hold all three addresses,
+ * anyway).
+ */
+
+#define R_Y_OFF		0			/* offset to R => Y section */
+#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
+#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
+#define TABLE_SIZE	(3*(MAXJSAMPLE+1))
+
 
 /*
  * Initialize tables for YCC->RGB colorspace conversion.
@@ -160,6 +181,98 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
 /**************** Cases other than YCbCr -> RGB **************/
 
 
+/*
+ * Initialize for RGB->grayscale colorspace conversion.
+ */
+
+LOCAL(void)
+build_rgb_y_table (j_decompress_ptr cinfo)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  INT32 * rgb_y_tab;
+  INT32 i;
+
+  /* Allocate and fill in the conversion tables. */
+  cconvert->rgb_y_tab = rgb_y_tab = (INT32 *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				(TABLE_SIZE * SIZEOF(INT32)));
+
+  for (i = 0; i <= MAXJSAMPLE; i++) {
+    rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
+    rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
+    rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+  }
+}
+
+
+/*
+ * Convert RGB to grayscale.
+ */
+
+METHODDEF(void)
+rgb_gray_convert (j_decompress_ptr cinfo,
+		  JSAMPIMAGE input_buf, JDIMENSION input_row,
+		  JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register int r, g, b;
+  register INT32 * ctab = cconvert->rgb_y_tab;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr0[col]);
+      g = GETJSAMPLE(inptr1[col]);
+      b = GETJSAMPLE(inptr2[col]);
+      /* Y */
+      outptr[col] = (JSAMPLE)
+		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+		 >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * No colorspace change, but conversion from separate-planes
+ * to interleaved representation.
+ */
+
+METHODDEF(void)
+rgb_convert (j_decompress_ptr cinfo,
+	     JSAMPIMAGE input_buf, JDIMENSION input_row,
+	     JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr[RGB_RED]   = inptr0[col];
+      outptr[RGB_GREEN] = inptr1[col];
+      outptr[RGB_BLUE]  = inptr2[col];
+      outptr += RGB_PIXELSIZE;
+    }
+  }
+}
+
+
 /*
  * Color conversion for no colorspace change: just copy the data,
  * converting from separate-planes to interleaved representation.
@@ -351,6 +464,9 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
       /* For color->grayscale conversion, only the Y (0) component is needed */
       for (ci = 1; ci < cinfo->num_components; ci++)
 	cinfo->comp_info[ci].component_needed = FALSE;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      cconvert->pub.color_convert = rgb_gray_convert;
+      build_rgb_y_table(cinfo);
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
@@ -362,8 +478,8 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
       build_ycc_rgb_table(cinfo);
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
-      cconvert->pub.color_convert = null_convert;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      cconvert->pub.color_convert = rgb_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
diff --git a/3rdparty/libjpeg/jdct.h b/3rdparty/libjpeg/jdct.h
index 04192a266a..360dec80c9 100644
--- a/3rdparty/libjpeg/jdct.h
+++ b/3rdparty/libjpeg/jdct.h
@@ -14,11 +14,16 @@
 
 
 /*
- * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
- * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples.  (NOTE: Floating-point DCT
- * implementations use an array of type FAST_FLOAT, instead.)
- * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * A forward DCT routine is given a pointer to an input sample array and
+ * a pointer to a work area of type DCTELEM[]; the DCT is to be performed
+ * in-place in that buffer.  Type DCTELEM is int for 8-bit samples, INT32
+ * for 12-bit samples.  (NOTE: Floating-point DCT implementations use an
+ * array of type FAST_FLOAT, instead.)
+ * The input data is to be fetched from the sample array starting at a
+ * specified column.  (Any row offset needed will be applied to the array
+ * pointer before it is passed to the FDCT code.)
+ * Note that the number of samples fetched by the FDCT routine is
+ * DCT_h_scaled_size * DCT_v_scaled_size.
  * The DCT outputs are returned scaled up by a factor of 8; they therefore
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
@@ -32,8 +37,12 @@ typedef int DCTELEM;		/* 16 or 32 bits is fine */
 typedef INT32 DCTELEM;		/* must have 32 bits */
 #endif
 
-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data,
+					       JSAMPARRAY sample_data,
+					       JDIMENSION start_col));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data,
+					     JSAMPARRAY sample_data,
+					     JDIMENSION start_col));
 
 
 /*
@@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
  * sample array starting at a specified column.  (Any row offset needed will
  * be applied to the array pointer before it is passed to the IDCT code.)
  * Note that the number of samples emitted by the IDCT routine is
- * DCT_scaled_size * DCT_scaled_size.
+ * DCT_h_scaled_size * DCT_v_scaled_size.
  */
 
 /* typedef inverse_DCT_method_ptr is declared in jpegint.h */
@@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #define jpeg_fdct_islow		jFDislow
 #define jpeg_fdct_ifast		jFDifast
 #define jpeg_fdct_float		jFDfloat
+#define jpeg_fdct_7x7		jFD7x7
+#define jpeg_fdct_6x6		jFD6x6
+#define jpeg_fdct_5x5		jFD5x5
+#define jpeg_fdct_4x4		jFD4x4
+#define jpeg_fdct_3x3		jFD3x3
+#define jpeg_fdct_2x2		jFD2x2
+#define jpeg_fdct_1x1		jFD1x1
+#define jpeg_fdct_9x9		jFD9x9
+#define jpeg_fdct_10x10		jFD10x10
+#define jpeg_fdct_11x11		jFD11x11
+#define jpeg_fdct_12x12		jFD12x12
+#define jpeg_fdct_13x13		jFD13x13
+#define jpeg_fdct_14x14		jFD14x14
+#define jpeg_fdct_15x15		jFD15x15
+#define jpeg_fdct_16x16		jFD16x16
+#define jpeg_fdct_16x8		jFD16x8
+#define jpeg_fdct_14x7		jFD14x7
+#define jpeg_fdct_12x6		jFD12x6
+#define jpeg_fdct_10x5		jFD10x5
+#define jpeg_fdct_8x4		jFD8x4
+#define jpeg_fdct_6x3		jFD6x3
+#define jpeg_fdct_4x2		jFD4x2
+#define jpeg_fdct_2x1		jFD2x1
+#define jpeg_fdct_8x16		jFD8x16
+#define jpeg_fdct_7x14		jFD7x14
+#define jpeg_fdct_6x12		jFD6x12
+#define jpeg_fdct_5x10		jFD5x10
+#define jpeg_fdct_4x8		jFD4x8
+#define jpeg_fdct_3x6		jFD3x6
+#define jpeg_fdct_2x4		jFD2x4
+#define jpeg_fdct_1x2		jFD1x2
 #define jpeg_idct_islow		jRDislow
 #define jpeg_idct_ifast		jRDifast
 #define jpeg_idct_float		jRDfloat
+#define jpeg_idct_7x7		jRD7x7
+#define jpeg_idct_6x6		jRD6x6
+#define jpeg_idct_5x5		jRD5x5
 #define jpeg_idct_4x4		jRD4x4
+#define jpeg_idct_3x3		jRD3x3
 #define jpeg_idct_2x2		jRD2x2
 #define jpeg_idct_1x1		jRD1x1
+#define jpeg_idct_9x9		jRD9x9
+#define jpeg_idct_10x10		jRD10x10
+#define jpeg_idct_11x11		jRD11x11
+#define jpeg_idct_12x12		jRD12x12
+#define jpeg_idct_13x13		jRD13x13
+#define jpeg_idct_14x14		jRD14x14
+#define jpeg_idct_15x15		jRD15x15
+#define jpeg_idct_16x16		jRD16x16
+#define jpeg_idct_16x8		jRD16x8
+#define jpeg_idct_14x7		jRD14x7
+#define jpeg_idct_12x6		jRD12x6
+#define jpeg_idct_10x5		jRD10x5
+#define jpeg_idct_8x4		jRD8x4
+#define jpeg_idct_6x3		jRD6x3
+#define jpeg_idct_4x2		jRD4x2
+#define jpeg_idct_2x1		jRD2x1
+#define jpeg_idct_8x16		jRD8x16
+#define jpeg_idct_7x14		jRD7x14
+#define jpeg_idct_6x12		jRD6x12
+#define jpeg_idct_5x10		jRD5x10
+#define jpeg_idct_4x8		jRD4x8
+#define jpeg_idct_3x6		jRD3x8
+#define jpeg_idct_2x4		jRD2x4
+#define jpeg_idct_1x2		jRD1x2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jpeg_fdct_islow
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_ifast
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_float
+    JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_9x9
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_11x11
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_13x13
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_15x15
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
 
 EXTERN(void) jpeg_idct_islow
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
@@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast
 EXTERN(void) jpeg_idct_float
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_4x4
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_2x2
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_1x1
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_9x9
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_11x11
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_13x13
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_15x15
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x1
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_1x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 
 
 /*
diff --git a/3rdparty/libjpeg/jddctmgr.c b/3rdparty/libjpeg/jddctmgr.c
index bbf8d0e92f..0ded9d5741 100644
--- a/3rdparty/libjpeg/jddctmgr.c
+++ b/3rdparty/libjpeg/jddctmgr.c
@@ -2,6 +2,7 @@
  * jddctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2002-2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -98,22 +99,134 @@ start_pass (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Select the proper IDCT routine for this component's scaling */
-    switch (compptr->DCT_scaled_size) {
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
-    case 1:
+    case ((1 << 8) + 1):
       method_ptr = jpeg_idct_1x1;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
-    case 2:
+    case ((2 << 8) + 2):
       method_ptr = jpeg_idct_2x2;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
-    case 4:
+    case ((3 << 8) + 3):
+      method_ptr = jpeg_idct_3x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
       method_ptr = jpeg_idct_4x4;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      method_ptr = jpeg_idct_5x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      method_ptr = jpeg_idct_6x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      method_ptr = jpeg_idct_7x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      method_ptr = jpeg_idct_9x9;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      method_ptr = jpeg_idct_10x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      method_ptr = jpeg_idct_11x11;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      method_ptr = jpeg_idct_12x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      method_ptr = jpeg_idct_13x13;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      method_ptr = jpeg_idct_14x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      method_ptr = jpeg_idct_15x15;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      method_ptr = jpeg_idct_16x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      method_ptr = jpeg_idct_16x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      method_ptr = jpeg_idct_14x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      method_ptr = jpeg_idct_12x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      method_ptr = jpeg_idct_10x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      method_ptr = jpeg_idct_8x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      method_ptr = jpeg_idct_6x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      method_ptr = jpeg_idct_4x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      method_ptr = jpeg_idct_2x1;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      method_ptr = jpeg_idct_8x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      method_ptr = jpeg_idct_7x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      method_ptr = jpeg_idct_6x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      method_ptr = jpeg_idct_5x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      method_ptr = jpeg_idct_4x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      method_ptr = jpeg_idct_3x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      method_ptr = jpeg_idct_2x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      method_ptr = jpeg_idct_1x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
       break;
 #endif
-    case DCTSIZE:
+    case ((DCTSIZE << 8) + DCTSIZE):
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
@@ -139,7 +252,8 @@ start_pass (j_decompress_ptr cinfo)
       }
       break;
     default:
-      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+	       compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
       break;
     }
     idct->pub.inverse_DCT[ci] = method_ptr;
@@ -211,6 +325,7 @@ start_pass (j_decompress_ptr cinfo)
 	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
 	 *   scalefactor[0] = 1
 	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+	 * We apply a further scale factor of 1/8.
 	 */
 	FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
 	int row, col;
@@ -224,7 +339,7 @@ start_pass (j_decompress_ptr cinfo)
 	  for (col = 0; col < DCTSIZE; col++) {
 	    fmtbl[i] = (FLOAT_MULT_TYPE)
 	      ((double) qtbl->quantval[i] *
-	       aanscalefactor[row] * aanscalefactor[col]);
+	       aanscalefactor[row] * aanscalefactor[col] * 0.125);
 	    i++;
 	  }
 	}
diff --git a/3rdparty/libjpeg/jdhuff.c b/3rdparty/libjpeg/jdhuff.c
index b5ba39f736..06f92fe47f 100644
--- a/3rdparty/libjpeg/jdhuff.c
+++ b/3rdparty/libjpeg/jdhuff.c
@@ -2,10 +2,12 @@
  * jdhuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2006-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains Huffman entropy decoding routines.
+ * Both sequential and progressive modes are supported in this single module.
  *
  * Much of the complexity here has to do with supporting input suspension.
  * If the data source module demands suspension, we want to be able to back
@@ -17,7 +19,173 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdphuff.c */
+
+
+/* Derived data constructed for each Huffman table */
+
+#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+
+typedef struct {
+  /* Basic tables: (element [0] of each array is unused) */
+  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
+  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
+  INT32 valoffset[17];		/* huffval[] offset for codes of length k */
+  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
+   * the smallest code of length k; so given a code of length k, the
+   * corresponding symbol is huffval[code + valoffset[k]]
+   */
+
+  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
+  JHUFF_TBL *pub;
+
+  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+   * the input data stream.  If the next Huffman code is no more
+   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
+   * the corresponding symbol directly from these tables.
+   */
+  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
+  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+} d_derived_tbl;
+
+
+/*
+ * Fetching the next N bits from the input stream is a time-critical operation
+ * for the Huffman decoders.  We implement it with a combination of inline
+ * macros and out-of-line subroutines.  Note that N (the number of bits
+ * demanded at one time) never exceeds 15 for JPEG use.
+ *
+ * We read source bytes into get_buffer and dole out bits as needed.
+ * If get_buffer already contains enough bits, they are fetched in-line
+ * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
+ * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
+ * as full as possible (not just to the number of bits needed; this
+ * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
+ * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
+ * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
+ * at least the requested number of bits --- dummy zeroes are inserted if
+ * necessary.
+ */
+
+typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  32	/* size of buffer in bits */
+
+/* If long is > 32 bits on your machine, and shifting/masking longs is
+ * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
+ * appropriately should be a win.  Unfortunately we can't define the size
+ * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
+ * because not all machines measure sizeof in 8-bit bytes.
+ */
+
+typedef struct {		/* Bitreading state saved across MCUs */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+} bitread_perm_state;
+
+typedef struct {		/* Bitreading working state within an MCU */
+  /* Current data source location */
+  /* We need a copy, rather than munging the original, in case of suspension */
+  const JOCTET * next_input_byte; /* => next byte to read from source */
+  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
+  /* Bit input buffer --- note these values are kept in register variables,
+   * not in this struct, inside the inner loops.
+   */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+  /* Pointer needed by jpeg_fill_bit_buffer. */
+  j_decompress_ptr cinfo;	/* back link to decompress master record */
+} bitread_working_state;
+
+/* Macros to declare and load/save bitread local variables. */
+#define BITREAD_STATE_VARS  \
+	register bit_buf_type get_buffer;  \
+	register int bits_left;  \
+	bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop,permstate)  \
+	br_state.cinfo = cinfop; \
+	br_state.next_input_byte = cinfop->src->next_input_byte; \
+	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+	get_buffer = permstate.get_buffer; \
+	bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop,permstate)  \
+	cinfop->src->next_input_byte = br_state.next_input_byte; \
+	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+	permstate.get_buffer = get_buffer; \
+	permstate.bits_left = bits_left
+
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
+ * before using GET_BITS, PEEK_BITS, or DROP_BITS.
+ * The variables get_buffer and bits_left are assumed to be locals,
+ * but the state struct might not be (jpeg_huff_decode needs this).
+ *	CHECK_BIT_BUFFER(state,n,action);
+ *		Ensure there are N bits in get_buffer; if suspend, take action.
+ *      val = GET_BITS(n);
+ *		Fetch next N bits.
+ *      val = PEEK_BITS(n);
+ *		Fetch next N bits without removing them from the buffer.
+ *	DROP_BITS(n);
+ *		Discard next N bits.
+ * The value N should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ */
+
+#define CHECK_BIT_BUFFER(state,nbits,action) \
+	{ if (bits_left < (nbits)) {  \
+	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
+	      { action; }  \
+	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+
+#define GET_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -= (nbits)))) & BIT_MASK(nbits))
+
+#define PEEK_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -  (nbits)))) & BIT_MASK(nbits))
+
+#define DROP_BITS(nbits) \
+	(bits_left -= (nbits))
+
+
+/*
+ * Code for extracting next Huffman-coded symbol from input bit stream.
+ * Again, this is time-critical and we make the main paths be macros.
+ *
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping.  Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long.  The few overlength codes are handled with a loop,
+ * which need not be inline code.
+ *
+ * Notes about the HUFF_DECODE macro:
+ * 1. Near the end of the data segment, we may fail to get enough bits
+ *    for a lookahead.  In that case, we do it the hard way.
+ * 2. If the lookahead table contains no entry, the next code must be
+ *    more than HUFF_LOOKAHEAD bits long.
+ * 3. jpeg_huff_decode returns -1 if forced to suspend.
+ */
+
+#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
+{ register int nb, look; \
+  if (bits_left < HUFF_LOOKAHEAD) { \
+    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+    if (bits_left < HUFF_LOOKAHEAD) { \
+      nb = 1; goto slowlabel; \
+    } \
+  } \
+  look = PEEK_BITS(HUFF_LOOKAHEAD); \
+  if ((nb = htbl->look_nbits[look]) != 0) { \
+    DROP_BITS(nb); \
+    result = htbl->look_sym[look]; \
+  } else { \
+    nb = HUFF_LOOKAHEAD+1; \
+slowlabel: \
+    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
+	{ failaction; } \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+  } \
+}
 
 
 /*
@@ -28,7 +196,8 @@
  */
 
 typedef struct {
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
+  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
 } savable_state;
 
 /* This macro is to work around compilers with missing or broken
@@ -41,7 +210,8 @@ typedef struct {
 #else
 #if MAX_COMPS_IN_SCAN == 4
 #define ASSIGN_STATE(dest,src)  \
-	((dest).last_dc_val[0] = (src).last_dc_val[0], \
+	((dest).EOBRUN = (src).EOBRUN, \
+	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
 	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
 	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
 	 (dest).last_dc_val[3] = (src).last_dc_val[3])
@@ -59,8 +229,18 @@ typedef struct {
   savable_state saved;		/* Other state at start of MCU */
 
   /* These fields are NOT loaded into local working state. */
+  boolean insufficient_data;	/* set TRUE after emitting warning */
   unsigned int restarts_to_go;	/* MCUs left in this restart interval */
 
+  /* Following two fields used only in progressive mode */
+
+  /* Pointers to derived tables (these workspaces have image lifespan) */
+  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
+
+  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+
+  /* Following fields used only in sequential mode */
+
   /* Pointers to derived tables (these workspaces have image lifespan) */
   d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS];
   d_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS];
@@ -71,81 +251,75 @@ typedef struct {
   d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU];
   /* Whether we care about the DC and AC coefficient values for each block */
-  boolean dc_needed[D_MAX_BLOCKS_IN_MCU];
-  boolean ac_needed[D_MAX_BLOCKS_IN_MCU];
+  int coef_limit[D_MAX_BLOCKS_IN_MCU];
 } huff_entropy_decoder;
 
 typedef huff_entropy_decoder * huff_entropy_ptr;
 
 
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
-{
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int ci, blkn, dctbl, actbl;
-  jpeg_component_info * compptr;
-
-  /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
-   * This ought to be an error condition, but we make it a warning because
-   * there are some baseline files out there with all zeroes in these bytes.
-   */
-  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
-      cinfo->Ah != 0 || cinfo->Al != 0)
-    WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    dctbl = compptr->dc_tbl_no;
-    actbl = compptr->ac_tbl_no;
-    /* Compute derived values for Huffman tables */
-    /* We may do this more than once for a table, but it's not expensive */
-    jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
-			    & entropy->dc_derived_tbls[dctbl]);
-    jpeg_make_d_derived_tbl(cinfo, FALSE, actbl,
-			    & entropy->ac_derived_tbls[actbl]);
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Precalculate decoding info for each block in an MCU of this scan */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-    /* Precalculate which table to use for each block */
-    entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
-    entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
-    /* Decide whether we really care about the coefficient values */
-    if (compptr->component_needed) {
-      entropy->dc_needed[blkn] = TRUE;
-      /* we don't need the ACs if producing a 1/8th-size image */
-      entropy->ac_needed[blkn] = (compptr->DCT_scaled_size > 1);
-    } else {
-      entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE;
-    }
-  }
-
-  /* Initialize bitread state variables */
-  entropy->bitstate.bits_left = 0;
-  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
-  entropy->pub.insufficient_data = FALSE;
-
-  /* Initialize restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-}
+static const int jpeg_zigzag_order[8][8] = {
+  {  0,  1,  5,  6, 14, 15, 27, 28 },
+  {  2,  4,  7, 13, 16, 26, 29, 42 },
+  {  3,  8, 12, 17, 25, 30, 41, 43 },
+  {  9, 11, 18, 24, 31, 40, 44, 53 },
+  { 10, 19, 23, 32, 39, 45, 52, 54 },
+  { 20, 22, 33, 38, 46, 51, 55, 60 },
+  { 21, 34, 37, 47, 50, 56, 59, 61 },
+  { 35, 36, 48, 49, 57, 58, 62, 63 }
+};
+
+static const int jpeg_zigzag_order7[7][7] = {
+  {  0,  1,  5,  6, 14, 15, 27 },
+  {  2,  4,  7, 13, 16, 26, 28 },
+  {  3,  8, 12, 17, 25, 29, 38 },
+  {  9, 11, 18, 24, 30, 37, 39 },
+  { 10, 19, 23, 31, 36, 40, 45 },
+  { 20, 22, 32, 35, 41, 44, 46 },
+  { 21, 33, 34, 42, 43, 47, 48 }
+};
+
+static const int jpeg_zigzag_order6[6][6] = {
+  {  0,  1,  5,  6, 14, 15 },
+  {  2,  4,  7, 13, 16, 25 },
+  {  3,  8, 12, 17, 24, 26 },
+  {  9, 11, 18, 23, 27, 32 },
+  { 10, 19, 22, 28, 31, 33 },
+  { 20, 21, 29, 30, 34, 35 }
+};
+
+static const int jpeg_zigzag_order5[5][5] = {
+  {  0,  1,  5,  6, 14 },
+  {  2,  4,  7, 13, 15 },
+  {  3,  8, 12, 16, 21 },
+  {  9, 11, 17, 20, 22 },
+  { 10, 18, 19, 23, 24 }
+};
+
+static const int jpeg_zigzag_order4[4][4] = {
+  { 0,  1,  5,  6 },
+  { 2,  4,  7, 12 },
+  { 3,  8, 11, 13 },
+  { 9, 10, 14, 15 }
+};
+
+static const int jpeg_zigzag_order3[3][3] = {
+  { 0, 1, 5 },
+  { 2, 4, 6 },
+  { 3, 7, 8 }
+};
+
+static const int jpeg_zigzag_order2[2][2] = {
+  { 0, 1 },
+  { 2, 3 }
+};
 
 
 /*
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
- *
- * Note this is also used by jdphuff.c.
  */
 
-GLOBAL(void)
+LOCAL(void)
 jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 			 d_derived_tbl ** pdtbl)
 {
@@ -267,8 +441,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 
 
 /*
- * Out-of-line code for bit fetching (shared with jdphuff.c).
- * See jdhuff.h for info about usage.
+ * Out-of-line code for bit fetching.
  * Note: current values of get_buffer and bits_left are passed as parameters,
  * but are returned in the corresponding fields of the state struct.
  *
@@ -288,7 +461,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 #endif
 
 
-GLOBAL(boolean)
+LOCAL(boolean)
 jpeg_fill_bit_buffer (bitread_working_state * state,
 		      register bit_buf_type get_buffer, register int bits_left,
 		      int nbits)
@@ -369,9 +542,9 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
        * We use a nonvolatile flag to ensure that only one warning message
        * appears per data segment.
        */
-      if (! cinfo->entropy->insufficient_data) {
+      if (! ((huff_entropy_ptr) cinfo->entropy)->insufficient_data) {
 	WARNMS(cinfo, JWRN_HIT_MARKER);
-	cinfo->entropy->insufficient_data = TRUE;
+	((huff_entropy_ptr) cinfo->entropy)->insufficient_data = TRUE;
       }
       /* Fill the buffer with zero bits */
       get_buffer <<= MIN_GET_BITS - bits_left;
@@ -389,12 +562,33 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
 }
 
 
+/*
+ * Figure F.12: extend sign bit.
+ * On some machines, a shift and sub will be faster than a table lookup.
+ */
+
+#ifdef AVOID_TABLES
+
+#define BIT_MASK(nbits)   ((1<<(nbits))-1)
+#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) - ((1<<(s))-1) : (x))
+
+#else
+
+#define BIT_MASK(nbits)   bmask[nbits]
+#define HUFF_EXTEND(x,s)  ((x) <= bmask[(s) - 1] ? (x) - bmask[s] : (x))
+
+static const int bmask[16] =	/* bmask[n] is mask for n rightmost bits */
+  { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF,
+    0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF };
+
+#endif /* AVOID_TABLES */
+
+
 /*
  * Out-of-line code for Huffman code decoding.
- * See jdhuff.h for info about usage.
  */
 
-GLOBAL(int)
+LOCAL(int)
 jpeg_huff_decode (bitread_working_state * state,
 		  register bit_buf_type get_buffer, register int bits_left,
 		  d_derived_tbl * htbl, int min_bits)
@@ -433,32 +627,6 @@ jpeg_huff_decode (bitread_working_state * state,
 }
 
 
-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
 /*
  * Check for a restart marker & resynchronize decoder.
  * Returns FALSE if must suspend.
@@ -482,6 +650,8 @@ process_restart (j_decompress_ptr cinfo)
   /* Re-initialize DC predictions to 0 */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++)
     entropy->saved.last_dc_val[ci] = 0;
+  /* Re-init EOB run count, too */
+  entropy->saved.EOBRUN = 0;
 
   /* Reset restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -492,34 +662,47 @@ process_restart (j_decompress_ptr cinfo)
    * leaving the flag set.
    */
   if (cinfo->unread_marker == 0)
-    entropy->pub.insufficient_data = FALSE;
+    entropy->insufficient_data = FALSE;
 
   return TRUE;
 }
 
 
 /*
- * Decode and return one MCU's worth of Huffman-compressed coefficients.
+ * Huffman MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * Huffman-compressed coefficients. 
  * The coefficients are reordered from zigzag order into natural array order,
  * but are not dequantized.
  *
  * The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i].  WE ASSUME THIS AREA HAS BEEN ZEROED BY THE CALLER.
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
  * (Wholesale zeroing is usually a little faster than retail...)
  *
- * Returns FALSE if data source requested suspension.  In that case no
+ * We return FALSE if data source requested suspension.  In that case no
  * changes have been made to permanent state.  (Exception: some output
  * coefficients may already have been assigned.  This is harmless for
- * this module, since we'll just re-assign them on the next call.)
+ * spectral selection, since we'll just re-assign them on the next call.
+ * Successive approximation AC refinement has to be more careful, however.)
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
-  int blkn;
+  int Al = cinfo->Al;
+  register int s, r;
+  int blkn, ci;
+  JBLOCKROW block;
   BITREAD_STATE_VARS;
   savable_state state;
+  d_derived_tbl * tbl;
+  jpeg_component_info * compptr;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -531,7 +714,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (! entropy->insufficient_data) {
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
@@ -540,79 +723,571 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Outer loop handles each block in the MCU */
 
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      JBLOCKROW block = MCU_data[blkn];
-      d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn];
-      d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
-      register int s, k, r;
+      block = MCU_data[blkn];
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
 
       /* Decode a single block's worth of coefficients */
 
       /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
+      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
       if (s) {
 	CHECK_BIT_BUFFER(br_state, s, return FALSE);
 	r = GET_BITS(s);
 	s = HUFF_EXTEND(r, s);
       }
 
-      if (entropy->dc_needed[blkn]) {
+      /* Convert DC difference to actual value, update last_dc_val */
+      s += state.last_dc_val[ci];
+      state.last_dc_val[ci] = s;
+      /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
+      (*block)[0] = (JCOEF) (s << Al);
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(entropy->saved, state);
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int s, k, r;
+  unsigned int EOBRUN;
+  int Se, Al;
+  const int * natural_order;
+  JBLOCKROW block;
+  BITREAD_STATE_VARS;
+  d_derived_tbl * tbl;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    Se = cinfo->Se;
+    Al = cinfo->Al;
+    natural_order = cinfo->natural_order;
+
+    /* Load up working state.
+     * We can avoid loading/saving bitread state if in an EOB run.
+     */
+    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
+
+    /* There is always only one block per MCU */
+
+    if (EOBRUN > 0)		/* if it's a band of zeroes... */
+      EOBRUN--;			/* ...process it now (we do nothing) */
+    else {
+      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+      block = MCU_data[0];
+      tbl = entropy->ac_derived_tbl;
+
+      for (k = cinfo->Ss; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
+	r = s >> 4;
+	s &= 15;
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	  /* Scale and output coefficient in natural (dezigzagged) order */
+	  (*block)[natural_order[k]] = (JCOEF) (s << Al);
+	} else {
+	  if (r == 15) {	/* ZRL */
+	    k += 15;		/* skip 15 zeroes in band */
+	  } else {		/* EOBr, run length is 2^r + appended bits */
+	    EOBRUN = 1 << r;
+	    if (r) {		/* EOBr, r > 0 */
+	      CHECK_BIT_BUFFER(br_state, r, return FALSE);
+	      r = GET_BITS(r);
+	      EOBRUN += r;
+	    }
+	    EOBRUN--;		/* this band is processed at this moment */
+	    break;		/* force end-of-band */
+	  }
+	}
+      }
+
+      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    }
+
+    /* Completed MCU, so update state */
+    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ * Note: we assume such scans can be multi-component, although the spec
+ * is not very clear on the point.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
+  int blkn;
+  JBLOCKROW block;
+  BITREAD_STATE_VARS;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* Not worth the cycles to check insufficient_data here,
+   * since we will not change the data anyway if we read zeroes.
+   */
+
+  /* Load up working state */
+  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    CHECK_BIT_BUFFER(br_state, 1, return FALSE);
+    if (GET_BITS(1))
+      (*block)[0] |= p1;
+    /* Note: since we use |=, repeating the assignment later is safe */
+  }
+
+  /* Completed MCU, so update state */
+  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{   
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  register int s, k, r;
+  unsigned int EOBRUN;
+  int Se, p1, m1;
+  const int * natural_order;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  BITREAD_STATE_VARS;
+  d_derived_tbl * tbl;
+  int num_newnz;
+  int newnz_pos[DCTSIZE2];
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, don't modify the MCU.
+   */
+  if (! entropy->insufficient_data) {
+
+    Se = cinfo->Se;
+    p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
+    m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+    natural_order = cinfo->natural_order;
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
+
+    /* There is always only one block per MCU */
+    block = MCU_data[0];
+    tbl = entropy->ac_derived_tbl;
+
+    /* If we are forced to suspend, we must undo the assignments to any newly
+     * nonzero coefficients in the block, because otherwise we'd get confused
+     * next time about which coefficients were already nonzero.
+     * But we need not undo addition of bits to already-nonzero coefficients;
+     * instead, we can test the current bit to see if we already did it.
+     */
+    num_newnz = 0;
+
+    /* initialize coefficient loop counter to start of band */
+    k = cinfo->Ss;
+
+    if (EOBRUN == 0) {
+      for (; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
+	r = s >> 4;
+	s &= 15;
+	if (s) {
+	  if (s != 1)		/* size of new coef should always be 1 */
+	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	  if (GET_BITS(1))
+	    s = p1;		/* newly nonzero coef is positive */
+	  else
+	    s = m1;		/* newly nonzero coef is negative */
+	} else {
+	  if (r != 15) {
+	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
+	    if (r) {
+	      CHECK_BIT_BUFFER(br_state, r, goto undoit);
+	      r = GET_BITS(r);
+	      EOBRUN += r;
+	    }
+	    break;		/* rest of block is handled by EOB logic */
+	  }
+	  /* note s = 0 for processing ZRL */
+	}
+	/* Advance over already-nonzero coefs and r still-zero coefs,
+	 * appending correction bits to the nonzeroes.  A correction bit is 1
+	 * if the absolute value of the coefficient must be increased.
+	 */
+	do {
+	  thiscoef = *block + natural_order[k];
+	  if (*thiscoef != 0) {
+	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	    if (GET_BITS(1)) {
+	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
+		if (*thiscoef >= 0)
+		  *thiscoef += p1;
+		else
+		  *thiscoef += m1;
+	      }
+	    }
+	  } else {
+	    if (--r < 0)
+	      break;		/* reached target zero coefficient */
+	  }
+	  k++;
+	} while (k <= Se);
+	if (s) {
+	  int pos = natural_order[k];
+	  /* Output newly nonzero coefficient */
+	  (*block)[pos] = (JCOEF) s;
+	  /* Remember its position in case we have to suspend */
+	  newnz_pos[num_newnz++] = pos;
+	}
+      }
+    }
+
+    if (EOBRUN > 0) {
+      /* Scan any remaining coefficient positions after the end-of-band
+       * (the last newly nonzero coefficient, if any).  Append a correction
+       * bit to each already-nonzero coefficient.  A correction bit is 1
+       * if the absolute value of the coefficient must be increased.
+       */
+      for (; k <= Se; k++) {
+	thiscoef = *block + natural_order[k];
+	if (*thiscoef != 0) {
+	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	  if (GET_BITS(1)) {
+	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
+	      if (*thiscoef >= 0)
+		*thiscoef += p1;
+	      else
+		*thiscoef += m1;
+	    }
+	  }
+	}
+      }
+      /* Count one block completed in EOB run */
+      EOBRUN--;
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+
+undoit:
+  /* Re-zero any output coefficients that we made newly nonzero */
+  while (num_newnz > 0)
+    (*block)[newnz_pos[--num_newnz]] = 0;
+
+  return FALSE;
+}
+
+
+/*
+ * Decode one MCU's worth of Huffman-compressed coefficients,
+ * partial blocks.
+ */
+
+METHODDEF(boolean)
+decode_mcu_sub (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  const int * natural_order;
+  int Se, blkn;
+  BITREAD_STATE_VARS;
+  savable_state state;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    natural_order = cinfo->natural_order;
+    Se = cinfo->lim_Se;
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(state, entropy->saved);
+
+    /* Outer loop handles each block in the MCU */
+
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      JBLOCKROW block = MCU_data[blkn];
+      d_derived_tbl * htbl;
+      register int s, k, r;
+      int coef_limit, ci;
+
+      /* Decode a single block's worth of coefficients */
+
+      /* Section F.2.2.1: decode the DC coefficient difference */
+      htbl = entropy->dc_cur_tbls[blkn];
+      HUFF_DECODE(s, br_state, htbl, return FALSE, label1);
+
+      htbl = entropy->ac_cur_tbls[blkn];
+      k = 1;
+      coef_limit = entropy->coef_limit[blkn];
+      if (coef_limit) {
 	/* Convert DC difference to actual value, update last_dc_val */
-	int ci = cinfo->MCU_membership[blkn];
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	}
+	ci = cinfo->MCU_membership[blkn];
 	s += state.last_dc_val[ci];
 	state.last_dc_val[ci] = s;
-	/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
+	/* Output the DC coefficient */
 	(*block)[0] = (JCOEF) s;
-      }
-
-      if (entropy->ac_needed[blkn]) {
 
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* Since zeroes are skipped, output area must be cleared beforehand */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
+	for (; k < coef_limit; k++) {
+	  HUFF_DECODE(s, br_state, htbl, return FALSE, label2);
+
 	  r = s >> 4;
 	  s &= 15;
-      
+
 	  if (s) {
 	    k += r;
 	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
 	    r = GET_BITS(s);
 	    s = HUFF_EXTEND(r, s);
 	    /* Output coefficient in natural (dezigzagged) order.
-	     * Note: the extra entries in jpeg_natural_order[] will save us
-	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
+	     * Note: the extra entries in natural_order[] will save us
+	     * if k > Se, which could happen if the data is corrupted.
 	     */
-	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+	    (*block)[natural_order[k]] = (JCOEF) s;
 	  } else {
 	    if (r != 15)
-	      break;
+	      goto EndOfBlock;
 	    k += 15;
 	  }
 	}
-
       } else {
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	}
+      }
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (; k <= Se; k++) {
+	HUFF_DECODE(s, br_state, htbl, return FALSE, label3);
+
+	r = s >> 4;
+	s &= 15;
+
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	} else {
+	  if (r != 15)
+	    break;
+	  k += 15;
+	}
+      }
+
+      EndOfBlock: ;
+    }
+
+    /* Completed MCU, so update state */
+    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(entropy->saved, state);
+  }
+
+  /* Account for restart interval (no-op if not using restarts) */
+  entropy->restarts_to_go--;
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of Huffman-compressed coefficients,
+ * full-size blocks.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int blkn;
+  BITREAD_STATE_VARS;
+  savable_state state;
+
+  /* Process restart marker if needed; may have to suspend */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (! process_restart(cinfo))
+	return FALSE;
+  }
+
+  /* If we've run out of data, just leave the MCU set to zeroes.
+   * This way, we return uniform gray for the remainder of the segment.
+   */
+  if (! entropy->insufficient_data) {
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    ASSIGN_STATE(state, entropy->saved);
+
+    /* Outer loop handles each block in the MCU */
+
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      JBLOCKROW block = MCU_data[blkn];
+      d_derived_tbl * htbl;
+      register int s, k, r;
+      int coef_limit, ci;
+
+      /* Decode a single block's worth of coefficients */
+
+      /* Section F.2.2.1: decode the DC coefficient difference */
+      htbl = entropy->dc_cur_tbls[blkn];
+      HUFF_DECODE(s, br_state, htbl, return FALSE, label1);
+
+      htbl = entropy->ac_cur_tbls[blkn];
+      k = 1;
+      coef_limit = entropy->coef_limit[blkn];
+      if (coef_limit) {
+	/* Convert DC difference to actual value, update last_dc_val */
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  r = GET_BITS(s);
+	  s = HUFF_EXTEND(r, s);
+	}
+	ci = cinfo->MCU_membership[blkn];
+	s += state.last_dc_val[ci];
+	state.last_dc_val[ci] = s;
+	/* Output the DC coefficient */
+	(*block)[0] = (JCOEF) s;
 
 	/* Section F.2.2.2: decode the AC coefficients */
-	/* In this path we just discard the values */
-	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
+	/* Since zeroes are skipped, output area must be cleared beforehand */
+	for (; k < coef_limit; k++) {
+	  HUFF_DECODE(s, br_state, htbl, return FALSE, label2);
+
 	  r = s >> 4;
 	  s &= 15;
-      
+
 	  if (s) {
 	    k += r;
 	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
+	    r = GET_BITS(s);
+	    s = HUFF_EXTEND(r, s);
+	    /* Output coefficient in natural (dezigzagged) order.
+	     * Note: the extra entries in jpeg_natural_order[] will save us
+	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
+	     */
+	    (*block)[jpeg_natural_order[k]] = (JCOEF) s;
 	  } else {
 	    if (r != 15)
-	      break;
+	      goto EndOfBlock;
 	    k += 15;
 	  }
 	}
+      } else {
+	if (s) {
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	}
+      }
+
+      /* Section F.2.2.2: decode the AC coefficients */
+      /* In this path we just discard the values */
+      for (; k < DCTSIZE2; k++) {
+	HUFF_DECODE(s, br_state, htbl, return FALSE, label3);
 
+	r = s >> 4;
+	s &= 15;
+
+	if (s) {
+	  k += r;
+	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	  DROP_BITS(s);
+	} else {
+	  if (r != 15)
+	    break;
+	  k += 15;
+	}
       }
+
+      EndOfBlock: ;
     }
 
     /* Completed MCU, so update state */
@@ -627,6 +1302,205 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 }
 
 
+/*
+ * Initialize for a Huffman-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass_huff_decoder (j_decompress_ptr cinfo)
+{
+  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  int ci, blkn, tbl, i;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > cinfo->lim_Se)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
+       * but the spec doesn't say so, and we try to be liberal about what we
+       * accept.  Note: large Al values could result in out-of-range DC
+       * coefficients during early scans, leading to bizarre displays due to
+       * overflows in the IDCT math.  But we won't crash.
+       */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Make sure requested tables are present, and compute derived tables.
+       * We may build same derived table more than once, but it's not expensive.
+       */
+      if (cinfo->Ss == 0) {
+	if (cinfo->Ah == 0) {	/* DC refinement needs no table */
+	  tbl = compptr->dc_tbl_no;
+	  jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
+				  & entropy->derived_tbls[tbl]);
+	}
+      } else {
+	tbl = compptr->ac_tbl_no;
+	jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->derived_tbls[tbl]);
+	/* remember the single active table */
+	entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+
+    /* Initialize private state variables */
+    entropy->saved.EOBRUN = 0;
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning because
+     * there are some baseline files out there with all zeroes in these bytes.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	((cinfo->is_baseline || cinfo->Se < DCTSIZE2) &&
+	cinfo->Se != cinfo->lim_Se))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+
+    /* Select MCU decoding routine */
+    /* We retain the hard-coded case for full-size blocks.
+     * This is not necessary, but it appears that this version is slightly
+     * more performant in the given implementation.
+     * With an improved implementation we would prefer a single optimized
+     * function.
+     */
+    if (cinfo->lim_Se != DCTSIZE2-1)
+      entropy->pub.decode_mcu = decode_mcu_sub;
+    else
+      entropy->pub.decode_mcu = decode_mcu;
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Compute derived values for Huffman tables */
+      /* We may do this more than once for a table, but it's not expensive */
+      tbl = compptr->dc_tbl_no;
+      jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
+			      & entropy->dc_derived_tbls[tbl]);
+      if (cinfo->lim_Se) {	/* AC needs no table when not present */
+	tbl = compptr->ac_tbl_no;
+	jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
+				& entropy->ac_derived_tbls[tbl]);
+      }
+      /* Initialize DC predictions to 0 */
+      entropy->saved.last_dc_val[ci] = 0;
+    }
+
+    /* Precalculate decoding info for each block in an MCU of this scan */
+    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+      ci = cinfo->MCU_membership[blkn];
+      compptr = cinfo->cur_comp_info[ci];
+      /* Precalculate which table to use for each block */
+      entropy->dc_cur_tbls[blkn] = entropy->dc_derived_tbls[compptr->dc_tbl_no];
+      entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no];
+      /* Decide whether we really care about the coefficient values */
+      if (compptr->component_needed) {
+	ci = compptr->DCT_v_scaled_size;
+	i = compptr->DCT_h_scaled_size;
+	switch (cinfo->lim_Se) {
+	case (1*1-1):
+	  entropy->coef_limit[blkn] = 1;
+	  break;
+	case (2*2-1):
+	  if (ci <= 0 || ci > 2) ci = 2;
+	  if (i <= 0 || i > 2) i = 2;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order2[ci - 1][i - 1];
+	  break;
+	case (3*3-1):
+	  if (ci <= 0 || ci > 3) ci = 3;
+	  if (i <= 0 || i > 3) i = 3;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order3[ci - 1][i - 1];
+	  break;
+	case (4*4-1):
+	  if (ci <= 0 || ci > 4) ci = 4;
+	  if (i <= 0 || i > 4) i = 4;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order4[ci - 1][i - 1];
+	  break;
+	case (5*5-1):
+	  if (ci <= 0 || ci > 5) ci = 5;
+	  if (i <= 0 || i > 5) i = 5;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order5[ci - 1][i - 1];
+	  break;
+	case (6*6-1):
+	  if (ci <= 0 || ci > 6) ci = 6;
+	  if (i <= 0 || i > 6) i = 6;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order6[ci - 1][i - 1];
+	  break;
+	case (7*7-1):
+	  if (ci <= 0 || ci > 7) ci = 7;
+	  if (i <= 0 || i > 7) i = 7;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order7[ci - 1][i - 1];
+	  break;
+	default:
+	  if (ci <= 0 || ci > 8) ci = 8;
+	  if (i <= 0 || i > 8) i = 8;
+	  entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order[ci - 1][i - 1];
+	  break;
+	}
+      } else {
+	entropy->coef_limit[blkn] = 0;
+      }
+    }
+  }
+
+  /* Initialize bitread state variables */
+  entropy->bitstate.bits_left = 0;
+  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+  entropy->insufficient_data = FALSE;
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
 /*
  * Module initialization routine for Huffman entropy decoding.
  */
@@ -642,10 +1516,26 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
 				SIZEOF(huff_entropy_decoder));
   cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
   entropy->pub.start_pass = start_pass_huff_decoder;
-  entropy->pub.decode_mcu = decode_mcu;
 
-  /* Mark tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;
+
+    /* Mark derived tables unallocated */
+    for (i = 0; i < NUM_HUFF_TBLS; i++) {
+      entropy->derived_tbls[i] = NULL;
+    }
+  } else {
+    /* Mark tables unallocated */
+    for (i = 0; i < NUM_HUFF_TBLS; i++) {
+      entropy->dc_derived_tbls[i] = entropy->ac_derived_tbls[i] = NULL;
+    }
   }
 }
diff --git a/3rdparty/libjpeg/jdhuff.h b/3rdparty/libjpeg/jdhuff.h
deleted file mode 100644
index ae19b6cafd..0000000000
--- a/3rdparty/libjpeg/jdhuff.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * jdhuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy decoding routines
- * that are shared between the sequential decoder (jdhuff.c) and the
- * progressive decoder (jdphuff.c).  No other modules need to see these.
- */
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_d_derived_tbl	jMkDDerived
-#define jpeg_fill_bit_buffer	jFilBitBuf
-#define jpeg_huff_decode	jHufDecode
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
-/* Derived data constructed for each Huffman table */
-
-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
-
-typedef struct {
-  /* Basic tables: (element [0] of each array is unused) */
-  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
-  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
-  INT32 valoffset[17];		/* huffval[] offset for codes of length k */
-  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
-   * the smallest code of length k; so given a code of length k, the
-   * corresponding symbol is huffval[code + valoffset[k]]
-   */
-
-  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
-  JHUFF_TBL *pub;
-
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
-   * the input data stream.  If the next Huffman code is no more
-   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
-   */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
-} d_derived_tbl;
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
-	JPP((j_decompress_ptr cinfo, boolean isDC, int tblno,
-	     d_derived_tbl ** pdtbl));
-
-
-/*
- * Fetching the next N bits from the input stream is a time-critical operation
- * for the Huffman decoders.  We implement it with a combination of inline
- * macros and out-of-line subroutines.  Note that N (the number of bits
- * demanded at one time) never exceeds 15 for JPEG use.
- *
- * We read source bytes into get_buffer and dole out bits as needed.
- * If get_buffer already contains enough bits, they are fetched in-line
- * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
- * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
- * as full as possible (not just to the number of bits needed; this
- * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
- * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
- * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
- * at least the requested number of bits --- dummy zeroes are inserted if
- * necessary.
- */
-
-typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32	/* size of buffer in bits */
-
-/* If long is > 32 bits on your machine, and shifting/masking longs is
- * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
- * appropriately should be a win.  Unfortunately we can't define the size
- * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
- * because not all machines measure sizeof in 8-bit bytes.
- */
-
-typedef struct {		/* Bitreading state saved across MCUs */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-} bitread_perm_state;
-
-typedef struct {		/* Bitreading working state within an MCU */
-  /* Current data source location */
-  /* We need a copy, rather than munging the original, in case of suspension */
-  const JOCTET * next_input_byte; /* => next byte to read from source */
-  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
-  /* Bit input buffer --- note these values are kept in register variables,
-   * not in this struct, inside the inner loops.
-   */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-  /* Pointer needed by jpeg_fill_bit_buffer. */
-  j_decompress_ptr cinfo;	/* back link to decompress master record */
-} bitread_working_state;
-
-/* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS  \
-	register bit_buf_type get_buffer;  \
-	register int bits_left;  \
-	bitread_working_state br_state
-
-#define BITREAD_LOAD_STATE(cinfop,permstate)  \
-	br_state.cinfo = cinfop; \
-	br_state.next_input_byte = cinfop->src->next_input_byte; \
-	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-	get_buffer = permstate.get_buffer; \
-	bits_left = permstate.bits_left;
-
-#define BITREAD_SAVE_STATE(cinfop,permstate)  \
-	cinfop->src->next_input_byte = br_state.next_input_byte; \
-	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-	permstate.get_buffer = get_buffer; \
-	permstate.bits_left = bits_left
-
-/*
- * These macros provide the in-line portion of bit fetching.
- * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
- * before using GET_BITS, PEEK_BITS, or DROP_BITS.
- * The variables get_buffer and bits_left are assumed to be locals,
- * but the state struct might not be (jpeg_huff_decode needs this).
- *	CHECK_BIT_BUFFER(state,n,action);
- *		Ensure there are N bits in get_buffer; if suspend, take action.
- *      val = GET_BITS(n);
- *		Fetch next N bits.
- *      val = PEEK_BITS(n);
- *		Fetch next N bits without removing them from the buffer.
- *	DROP_BITS(n);
- *		Discard next N bits.
- * The value N should be a simple variable, not an expression, because it
- * is evaluated multiple times.
- */
-
-#define CHECK_BIT_BUFFER(state,nbits,action) \
-	{ if (bits_left < (nbits)) {  \
-	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-	      { action; }  \
-	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
-
-#define GET_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
-
-#define PEEK_BITS(nbits) \
-	(((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
-
-#define DROP_BITS(nbits) \
-	(bits_left -= (nbits))
-
-/* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, int nbits));
-
-
-/*
- * Code for extracting next Huffman-coded symbol from input bit stream.
- * Again, this is time-critical and we make the main paths be macros.
- *
- * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
- * without looping.  Usually, more than 95% of the Huffman codes will be 8
- * or fewer bits long.  The few overlength codes are handled with a loop,
- * which need not be inline code.
- *
- * Notes about the HUFF_DECODE macro:
- * 1. Near the end of the data segment, we may fail to get enough bits
- *    for a lookahead.  In that case, we do it the hard way.
- * 2. If the lookahead table contains no entry, the next code must be
- *    more than HUFF_LOOKAHEAD bits long.
- * 3. jpeg_huff_decode returns -1 if forced to suspend.
- */
-
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
-  if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-    if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
-    } \
-  } \
-  look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
-    DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
-  } else { \
-    nb = HUFF_LOOKAHEAD+1; \
-slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-	{ failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-  } \
-}
-
-/* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
-	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-	     register int bits_left, d_derived_tbl * htbl, int min_bits));
diff --git a/3rdparty/libjpeg/jdinput.c b/3rdparty/libjpeg/jdinput.c
index 0c2ac8f120..2c5c717b9c 100644
--- a/3rdparty/libjpeg/jdinput.c
+++ b/3rdparty/libjpeg/jdinput.c
@@ -2,13 +2,14 @@
  * jdinput.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains input control logic for the JPEG decompressor.
  * These routines are concerned with controlling the decompressor's input
  * processing (marker reading and coefficient decoding).  The actual input
- * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ * reading is done in jdmarker.c, jdhuff.c, and jdarith.c.
  */
 
 #define JPEG_INTERNALS
@@ -21,7 +22,7 @@
 typedef struct {
   struct jpeg_input_controller pub; /* public fields */
 
-  boolean inheaders;		/* TRUE until first SOS is reached */
+  int inheaders;		/* Nonzero until first SOS is reached */
 } my_input_controller;
 
 typedef my_input_controller * my_inputctl_ptr;
@@ -35,6 +36,174 @@ METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
  * Routines to calculate various quantities related to the size of the image.
  */
 
+
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+/* Do computations that are needed before master selection phase.
+ * This function is used for transcoding and full decompression.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  jpeg_component_info *compptr;
+
+  /* Compute actual output image dimensions and DCT scaling choices. */
+  if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom) {
+    /* Provide 1/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 2) {
+    /* Provide 2/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 3) {
+    /* Provide 3/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 3L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 3L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 4) {
+    /* Provide 4/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 5) {
+    /* Provide 5/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 5L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 5L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 6) {
+    /* Provide 6/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 6L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 6L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 7) {
+    /* Provide 7/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 7L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 7L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 8) {
+    /* Provide 8/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 9) {
+    /* Provide 9/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 9L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 9L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 10) {
+    /* Provide 10/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 10L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 10L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 11) {
+    /* Provide 11/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 11L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 11L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 12) {
+    /* Provide 12/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 12L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 12L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 13) {
+    /* Provide 13/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 13L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 13L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 14) {
+    /* Provide 14/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 14L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 14L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 15) {
+    /* Provide 15/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 15L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 15L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide 16/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 16L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 16L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+  /* Recompute dimensions of components */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size;
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+  /* jdinput.c has already initialized DCT_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+
+
 LOCAL(void)
 initial_setup (j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
@@ -70,23 +239,121 @@ initial_setup (j_decompress_ptr cinfo)
 				   compptr->v_samp_factor);
   }
 
-  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
-   * In the full decompressor, this will be overridden by jdmaster.c;
-   * but in the transcoder, jdmaster.c is not used, so we must do it here.
+  /* Derive block_size, natural_order, and lim_Se */
+  if (cinfo->is_baseline || (cinfo->progressive_mode &&
+      cinfo->comps_in_scan)) { /* no pseudo SOS marker */
+    cinfo->block_size = DCTSIZE;
+    cinfo->natural_order = jpeg_natural_order;
+    cinfo->lim_Se = DCTSIZE2-1;
+  } else
+    switch (cinfo->Se) {
+    case (1*1-1):
+      cinfo->block_size = 1;
+      cinfo->natural_order = jpeg_natural_order; /* not needed */
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (2*2-1):
+      cinfo->block_size = 2;
+      cinfo->natural_order = jpeg_natural_order2;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (3*3-1):
+      cinfo->block_size = 3;
+      cinfo->natural_order = jpeg_natural_order3;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (4*4-1):
+      cinfo->block_size = 4;
+      cinfo->natural_order = jpeg_natural_order4;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (5*5-1):
+      cinfo->block_size = 5;
+      cinfo->natural_order = jpeg_natural_order5;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (6*6-1):
+      cinfo->block_size = 6;
+      cinfo->natural_order = jpeg_natural_order6;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (7*7-1):
+      cinfo->block_size = 7;
+      cinfo->natural_order = jpeg_natural_order7;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (8*8-1):
+      cinfo->block_size = 8;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (9*9-1):
+      cinfo->block_size = 9;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (10*10-1):
+      cinfo->block_size = 10;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (11*11-1):
+      cinfo->block_size = 11;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (12*12-1):
+      cinfo->block_size = 12;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (13*13-1):
+      cinfo->block_size = 13;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (14*14-1):
+      cinfo->block_size = 14;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (15*15-1):
+      cinfo->block_size = 15;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (16*16-1):
+      cinfo->block_size = 16;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    default:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+      break;
+    }
+
+  /* We initialize DCT_scaled_size and min_DCT_scaled_size to block_size.
+   * In the full decompressor,
+   * this will be overridden by jpeg_calc_output_dimensions in jdmaster.c;
+   * but in the transcoder,
+   * jpeg_calc_output_dimensions is not used, so we must do it here.
    */
-  cinfo->min_DCT_scaled_size = DCTSIZE;
+  cinfo->min_DCT_h_scaled_size = cinfo->block_size;
+  cinfo->min_DCT_v_scaled_size = cinfo->block_size;
 
   /* Compute dimensions of components */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    compptr->DCT_scaled_size = DCTSIZE;
+    compptr->DCT_h_scaled_size = cinfo->block_size;
+    compptr->DCT_v_scaled_size = cinfo->block_size;
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     /* downsampled_width and downsampled_height will also be overridden by
      * jdmaster.c if we are doing full decompression.  The transcoder library
      * doesn't use these values, but the calling application might.
@@ -107,7 +374,7 @@ initial_setup (j_decompress_ptr cinfo)
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long) cinfo->image_height,
-		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+	          (long) (cinfo->max_v_samp_factor * cinfo->block_size));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -138,7 +405,7 @@ per_scan_setup (j_decompress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = compptr->DCT_scaled_size;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -161,10 +428,10 @@ per_scan_setup (j_decompress_ptr cinfo)
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width,
-		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
     
     cinfo->blocks_in_MCU = 0;
     
@@ -174,7 +441,7 @@ per_scan_setup (j_decompress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
@@ -282,6 +549,10 @@ finish_input_pass (j_decompress_ptr cinfo)
  * The consume_input method pointer points either here or to the
  * coefficient controller's consume_data routine, depending on whether
  * we are reading a compressed data segment or inter-segment markers.
+ *
+ * Note: This function should NOT return a pseudo SOS marker (with zero
+ * component number) to the caller.  A pseudo marker received by
+ * read_markers is processed and then skipped for other markers.
  */
 
 METHODDEF(int)
@@ -293,41 +564,50 @@ consume_markers (j_decompress_ptr cinfo)
   if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
     return JPEG_REACHED_EOI;
 
-  val = (*cinfo->marker->read_markers) (cinfo);
-
-  switch (val) {
-  case JPEG_REACHED_SOS:	/* Found SOS */
-    if (inputctl->inheaders) {	/* 1st SOS */
-      initial_setup(cinfo);
-      inputctl->inheaders = FALSE;
-      /* Note: start_input_pass must be called by jdmaster.c
-       * before any more input can be consumed.  jdapimin.c is
-       * responsible for enforcing this sequencing.
-       */
-    } else {			/* 2nd or later SOS marker */
-      if (! inputctl->pub.has_multiple_scans)
-	ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
-      start_input_pass(cinfo);
-    }
-    break;
-  case JPEG_REACHED_EOI:	/* Found EOI */
-    inputctl->pub.eoi_reached = TRUE;
-    if (inputctl->inheaders) {	/* Tables-only datastream, apparently */
-      if (cinfo->marker->saw_SOF)
-	ERREXIT(cinfo, JERR_SOF_NO_SOS);
-    } else {
-      /* Prevent infinite loop in coef ctlr's decompress_data routine
-       * if user set output_scan_number larger than number of scans.
-       */
-      if (cinfo->output_scan_number > cinfo->input_scan_number)
-	cinfo->output_scan_number = cinfo->input_scan_number;
+  for (;;) {			/* Loop to pass pseudo SOS marker */
+    val = (*cinfo->marker->read_markers) (cinfo);
+
+    switch (val) {
+    case JPEG_REACHED_SOS:	/* Found SOS */
+      if (inputctl->inheaders) { /* 1st SOS */
+	if (inputctl->inheaders == 1)
+	  initial_setup(cinfo);
+	if (cinfo->comps_in_scan == 0) { /* pseudo SOS marker */
+	  inputctl->inheaders = 2;
+	  break;
+	}
+	inputctl->inheaders = 0;
+	/* Note: start_input_pass must be called by jdmaster.c
+	 * before any more input can be consumed.  jdapimin.c is
+	 * responsible for enforcing this sequencing.
+	 */
+      } else {			/* 2nd or later SOS marker */
+	if (! inputctl->pub.has_multiple_scans)
+	  ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
+	if (cinfo->comps_in_scan == 0) /* unexpected pseudo SOS marker */
+	  break;
+	start_input_pass(cinfo);
+      }
+      return val;
+    case JPEG_REACHED_EOI:	/* Found EOI */
+      inputctl->pub.eoi_reached = TRUE;
+      if (inputctl->inheaders) { /* Tables-only datastream, apparently */
+	if (cinfo->marker->saw_SOF)
+	  ERREXIT(cinfo, JERR_SOF_NO_SOS);
+      } else {
+	/* Prevent infinite loop in coef ctlr's decompress_data routine
+	 * if user set output_scan_number larger than number of scans.
+	 */
+	if (cinfo->output_scan_number > cinfo->input_scan_number)
+	  cinfo->output_scan_number = cinfo->input_scan_number;
+      }
+      return val;
+    case JPEG_SUSPENDED:
+      return val;
+    default:
+      return val;
     }
-    break;
-  case JPEG_SUSPENDED:
-    break;
   }
-
-  return val;
 }
 
 
@@ -343,7 +623,7 @@ reset_input_controller (j_decompress_ptr cinfo)
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
   /* Reset other modules */
   (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
   (*cinfo->marker->reset_marker_reader) (cinfo);
@@ -377,5 +657,5 @@ jinit_input_controller (j_decompress_ptr cinfo)
    */
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
 }
diff --git a/3rdparty/libjpeg/jdmainct.c b/3rdparty/libjpeg/jdmainct.c
index a5619d3330..02723ca732 100644
--- a/3rdparty/libjpeg/jdmainct.c
+++ b/3rdparty/libjpeg/jdmainct.c
@@ -5,12 +5,12 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
- * This file contains the main_ptr buffer controller for decompression.
- * The main_ptr buffer lies between the JPEG decompressor proper and the
+ * This file contains the main buffer controller for decompression.
+ * The main buffer lies between the JPEG decompressor proper and the
  * post-processor; it holds downsampled data in the JPEG colorspace.
  *
  * Note that this code is bypassed in raw-data mode, since the application
- * supplies the equivalent of the main_ptr buffer in that case.
+ * supplies the equivalent of the main buffer in that case.
  */
 
 #define JPEG_INTERNALS
@@ -19,9 +19,9 @@
 
 
 /*
- * In the current system design, the main_ptr buffer need never be a full-image
+ * In the current system design, the main buffer need never be a full-image
  * buffer; any full-height buffers will be found inside the coefficient or
- * postprocessing controllers.  Nonetheless, the main_ptr controller is not
+ * postprocessing controllers.  Nonetheless, the main controller is not
  * trivial.  Its responsibility is to provide context rows for upsampling/
  * rescaling, and doing this in an efficient fashion is a bit tricky.
  *
@@ -159,24 +159,24 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
  * This is done only once, not once per pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
 
   /* Get top-level space for component array pointers.
    * We alloc both arrays with one call to save a few cycles.
    */
-  main_ptr->xbuffer[0] = (JSAMPIMAGE)
+  main->xbuffer[0] = (JSAMPIMAGE)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
-  main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
+  main->xbuffer[1] = main->xbuffer[0] + cinfo->num_components;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     /* Get space for pointer lists --- M+4 row groups in each list.
      * We alloc both pointer lists with one call to save a few cycles.
      */
@@ -184,9 +184,9 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				  2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
     xbuf += rgroup;		/* want one row group at negative offsets */
-    main_ptr->xbuffer[0][ci] = xbuf;
+    main->xbuffer[0][ci] = xbuf;
     xbuf += rgroup * (M + 4);
-    main_ptr->xbuffer[1][ci] = xbuf;
+    main->xbuffer[1][ci] = xbuf;
   }
 }
 
@@ -194,26 +194,26 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 LOCAL(void)
 make_funny_pointers (j_decompress_ptr cinfo)
 /* Create the funny pointer lists discussed in the comments above.
- * The actual workspace is already allocated (in main_ptr->buffer),
+ * The actual workspace is already allocated (in main->buffer),
  * and the space for the pointer lists is allocated too.
  * This routine just fills in the curiously ordered lists.
  * This will be repeated at the beginning of each pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY buf, xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = main_ptr->xbuffer[0][ci];
-    xbuf1 = main_ptr->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = main->xbuffer[0][ci];
+    xbuf1 = main->xbuffer[1][ci];
     /* First copy the workspace pointers as-is */
-    buf = main_ptr->buffer[ci];
+    buf = main->buffer[ci];
     for (i = 0; i < rgroup * (M + 2); i++) {
       xbuf0[i] = xbuf1[i] = buf[i];
     }
@@ -240,18 +240,18 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
  * This changes the pointer list state from top-of-image to the normal state.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = main_ptr->xbuffer[0][ci];
-    xbuf1 = main_ptr->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = main->xbuffer[0][ci];
+    xbuf1 = main->xbuffer[1][ci];
     for (i = 0; i < rgroup; i++) {
       xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
       xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
@@ -269,7 +269,7 @@ set_bottom_pointers (j_decompress_ptr cinfo)
  * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   int ci, i, rgroup, iMCUheight, rows_left;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
@@ -277,8 +277,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Count sample rows in one iMCU row and in one row group */
-    iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
-    rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+    iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size;
+    rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size;
     /* Count nondummy sample rows remaining for this component */
     rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
     if (rows_left == 0) rows_left = iMCUheight;
@@ -286,12 +286,12 @@ set_bottom_pointers (j_decompress_ptr cinfo)
      * so we need only do it once.
      */
     if (ci == 0) {
-      main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+      main->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
     }
     /* Duplicate the last real sample row rgroup*2 times; this pads out the
      * last partial rowgroup and ensures at least one full rowgroup of context.
      */
-    xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
+    xbuf = main->xbuffer[main->whichptr][ci];
     for (i = 0; i < rgroup * 2; i++) {
       xbuf[rows_left + i] = xbuf[rows_left-1];
     }
@@ -306,27 +306,27 @@ set_bottom_pointers (j_decompress_ptr cinfo)
 METHODDEF(void)
 start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
     if (cinfo->upsample->need_context_rows) {
-      main_ptr->pub.process_data = process_data_context_main;
+      main->pub.process_data = process_data_context_main;
       make_funny_pointers(cinfo); /* Create the xbuffer[] lists */
-      main_ptr->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
-      main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
-      main_ptr->iMCU_row_ctr = 0;
+      main->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
+      main->context_state = CTX_PREPARE_FOR_IMCU;
+      main->iMCU_row_ctr = 0;
     } else {
       /* Simple case with no context needed */
-      main_ptr->pub.process_data = process_data_simple_main;
+      main->pub.process_data = process_data_simple_main;
     }
-    main_ptr->buffer_full = FALSE;	/* Mark buffer empty */
-    main_ptr->rowgroup_ctr = 0;
+    main->buffer_full = FALSE;	/* Mark buffer empty */
+    main->rowgroup_ctr = 0;
     break;
 #ifdef QUANT_2PASS_SUPPORTED
   case JBUF_CRANK_DEST:
     /* For last pass of 2-pass quantization, just crank the postprocessor */
-    main_ptr->pub.process_data = process_data_crank_post;
+    main->pub.process_data = process_data_crank_post;
     break;
 #endif
   default:
@@ -346,32 +346,32 @@ process_data_simple_main (j_decompress_ptr cinfo,
 			  JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
 			  JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
   JDIMENSION rowgroups_avail;
 
-  /* Read input data if we haven't filled the main_ptr buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+  /* Read input data if we haven't filled the main buffer yet */
+  if (! main->buffer_full) {
+    if (! (*cinfo->coef->decompress_data) (cinfo, main->buffer))
       return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    main->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size;
   /* Note: at the bottom of the image, we may pass extra garbage row groups
    * to the postprocessor.  The postprocessor has to check for bottom
    * of image anyway (at row resolution), so no point in us doing it too.
    */
 
   /* Feed the postprocessor */
-  (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer,
-				     &main_ptr->rowgroup_ctr, rowgroups_avail,
+  (*cinfo->post->post_process_data) (cinfo, main->buffer,
+				     &main->rowgroup_ctr, rowgroups_avail,
 				     output_buf, out_row_ctr, out_rows_avail);
 
   /* Has postprocessor consumed all the data yet? If so, mark buffer empty */
-  if (main_ptr->rowgroup_ctr >= rowgroups_avail) {
-    main_ptr->buffer_full = FALSE;
-    main_ptr->rowgroup_ctr = 0;
+  if (main->rowgroup_ctr >= rowgroups_avail) {
+    main->buffer_full = FALSE;
+    main->rowgroup_ctr = 0;
   }
 }
 
@@ -386,15 +386,15 @@ process_data_context_main (j_decompress_ptr cinfo,
 			   JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
 			   JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main = (my_main_ptr) cinfo->main;
 
-  /* Read input data if we haven't filled the main_ptr buffer yet */
-  if (! main_ptr->buffer_full) {
+  /* Read input data if we haven't filled the main buffer yet */
+  if (! main->buffer_full) {
     if (! (*cinfo->coef->decompress_data) (cinfo,
-					   main_ptr->xbuffer[main_ptr->whichptr]))
+					   main->xbuffer[main->whichptr]))
       return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
-    main_ptr->iMCU_row_ctr++;	/* count rows received */
+    main->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    main->iMCU_row_ctr++;	/* count rows received */
   }
 
   /* Postprocessor typically will not swallow all the input data it is handed
@@ -402,47 +402,47 @@ process_data_context_main (j_decompress_ptr cinfo,
    * to exit and restart.  This switch lets us keep track of how far we got.
    * Note that each case falls through to the next on successful completion.
    */
-  switch (main_ptr->context_state) {
+  switch (main->context_state) {
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-			&main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, main->xbuffer[main->whichptr],
+			&main->rowgroup_ctr, main->rowgroups_avail,
 			output_buf, out_row_ctr, out_rows_avail);
-    if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+    if (main->rowgroup_ctr < main->rowgroups_avail)
       return;			/* Need to suspend */
-    main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+    main->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
       return;			/* Postprocessor exactly filled output buf */
     /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+    main->rowgroup_ctr = 0;
+    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1);
     /* Check for bottom of image: if so, tweak pointers to "duplicate"
      * the last sample row, and adjust rowgroups_avail to ignore padding rows.
      */
-    if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
+    if (main->iMCU_row_ctr == cinfo->total_iMCU_rows)
       set_bottom_pointers(cinfo);
-    main_ptr->context_state = CTX_PROCESS_IMCU;
+    main->context_state = CTX_PROCESS_IMCU;
     /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-			&main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, main->xbuffer[main->whichptr],
+			&main->rowgroup_ctr, main->rowgroups_avail,
 			output_buf, out_row_ctr, out_rows_avail);
-    if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+    if (main->rowgroup_ctr < main->rowgroups_avail)
       return;			/* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
-    if (main_ptr->iMCU_row_ctr == 1)
+    if (main->iMCU_row_ctr == 1)
       set_wraparound_pointers(cinfo);
     /* Prepare to load new iMCU row using other xbuffer list */
-    main_ptr->whichptr ^= 1;	/* 0=>1 or 1=>0 */
-    main_ptr->buffer_full = FALSE;
+    main->whichptr ^= 1;	/* 0=>1 or 1=>0 */
+    main->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
-    main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
-    main_ptr->context_state = CTX_POSTPONED_ROW;
+    main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2);
+    main->context_state = CTX_POSTPONED_ROW;
   }
 }
 
@@ -469,21 +469,21 @@ process_data_crank_post (j_decompress_ptr cinfo,
 
 
 /*
- * Initialize main_ptr buffer controller.
+ * Initialize main buffer controller.
  */
 
 GLOBAL(void)
 jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr main_ptr;
+  my_main_ptr main;
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
-  main_ptr = (my_main_ptr)
+  main = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 				SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
-  main_ptr->pub.start_pass = start_pass_main;
+  cinfo->main = (struct jpeg_d_main_controller *) main;
+  main->pub.start_pass = start_pass_main;
 
   if (need_full_buffer)		/* shouldn't happen */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -492,21 +492,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
    * ngroups is the number of row groups we need.
    */
   if (cinfo->upsample->need_context_rows) {
-    if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+    if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */
       ERREXIT(cinfo, JERR_NOTIMPL);
     alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
-    ngroups = cinfo->min_DCT_scaled_size + 2;
+    ngroups = cinfo->min_DCT_v_scaled_size + 2;
   } else {
-    ngroups = cinfo->min_DCT_scaled_size;
+    ngroups = cinfo->min_DCT_v_scaled_size;
   }
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 compptr->width_in_blocks * compptr->DCT_scaled_size,
+			 compptr->width_in_blocks * compptr->DCT_h_scaled_size,
 			 (JDIMENSION) (rgroup * ngroups));
   }
 }
diff --git a/3rdparty/libjpeg/jdmarker.c b/3rdparty/libjpeg/jdmarker.c
index f4cca8cc83..f2a9cc4295 100644
--- a/3rdparty/libjpeg/jdmarker.c
+++ b/3rdparty/libjpeg/jdmarker.c
@@ -2,6 +2,7 @@
  * jdmarker.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -234,7 +235,8 @@ get_soi (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof (j_decompress_ptr cinfo, boolean is_baseline, boolean is_prog,
+	 boolean is_arith)
 /* Process a SOFn marker */
 {
   INT32 length;
@@ -242,6 +244,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
   jpeg_component_info * compptr;
   INPUT_VARS(cinfo);
 
+  cinfo->is_baseline = is_baseline;
   cinfo->progressive_mode = is_prog;
   cinfo->arith_code = is_arith;
 
@@ -315,7 +318,9 @@ get_sos (j_decompress_ptr cinfo)
 
   TRACEMS1(cinfo, 1, JTRC_SOS, n);
 
-  if (length != (n * 2 + 6) || n < 1 || n > MAX_COMPS_IN_SCAN)
+  if (length != (n * 2 + 6) || n > MAX_COMPS_IN_SCAN ||
+      (n == 0 && !cinfo->progressive_mode))
+      /* pseudo SOS marker only allowed in progressive mode */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   cinfo->comps_in_scan = n;
@@ -359,8 +364,8 @@ get_sos (j_decompress_ptr cinfo)
   /* Prepare to scan data & restart markers */
   cinfo->marker->next_restart_num = 0;
 
-  /* Count another SOS marker */
-  cinfo->input_scan_number++;
+  /* Count another (non-pseudo) SOS marker */
+  if (n) cinfo->input_scan_number++;
 
   INPUT_SYNC(cinfo);
   return TRUE;
@@ -490,16 +495,18 @@ LOCAL(boolean)
 get_dqt (j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
-  INT32 length;
-  int n, i, prec;
+  INT32 length, count, i;
+  int n, prec;
   unsigned int tmp;
   JQUANT_TBL *quant_ptr;
+  const int *natural_order;
   INPUT_VARS(cinfo);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
 
   while (length > 0) {
+    length--;
     INPUT_BYTE(cinfo, n, return FALSE);
     prec = n >> 4;
     n &= 0x0F;
@@ -513,13 +520,43 @@ get_dqt (j_decompress_ptr cinfo)
       cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
     quant_ptr = cinfo->quant_tbl_ptrs[n];
 
-    for (i = 0; i < DCTSIZE2; i++) {
+    if (prec) {
+      if (length < DCTSIZE2 * 2) {
+	/* Initialize full table for safety. */
+	for (i = 0; i < DCTSIZE2; i++) {
+	  quant_ptr->quantval[i] = 1;
+	}
+	count = length >> 1;
+      } else
+	count = DCTSIZE2;
+    } else {
+      if (length < DCTSIZE2) {
+	/* Initialize full table for safety. */
+	for (i = 0; i < DCTSIZE2; i++) {
+	  quant_ptr->quantval[i] = 1;
+	}
+	count = length;
+      } else
+	count = DCTSIZE2;
+    }
+
+    switch (count) {
+    case (2*2): natural_order = jpeg_natural_order2; break;
+    case (3*3): natural_order = jpeg_natural_order3; break;
+    case (4*4): natural_order = jpeg_natural_order4; break;
+    case (5*5): natural_order = jpeg_natural_order5; break;
+    case (6*6): natural_order = jpeg_natural_order6; break;
+    case (7*7): natural_order = jpeg_natural_order7; break;
+    default:    natural_order = jpeg_natural_order;  break;
+    }
+
+    for (i = 0; i < count; i++) {
       if (prec)
 	INPUT_2BYTES(cinfo, tmp, return FALSE);
       else
 	INPUT_BYTE(cinfo, tmp, return FALSE);
       /* We convert the zigzag-order table to natural array order. */
-      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+      quant_ptr->quantval[natural_order[i]] = (UINT16) tmp;
     }
 
     if (cinfo->err->trace_level >= 2) {
@@ -532,8 +569,8 @@ get_dqt (j_decompress_ptr cinfo)
       }
     }
 
-    length -= DCTSIZE2+1;
-    if (prec) length -= DCTSIZE2;
+    length -= count;
+    if (prec) length -= count;
   }
 
   if (length != 0)
@@ -946,6 +983,11 @@ first_marker (j_decompress_ptr cinfo)
  *
  * Returns same codes as are defined for jpeg_consume_input:
  * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ *
+ * Note: This function may return a pseudo SOS marker (with zero
+ * component number) for treat by input controller's consume_input.
+ * consume_input itself should filter out (skip) the pseudo marker
+ * after processing for the caller.
  */
 
 METHODDEF(int)
@@ -975,23 +1017,27 @@ read_markers (j_decompress_ptr cinfo)
       break;
 
     case M_SOF0:		/* Baseline */
+      if (! get_sof(cinfo, TRUE, FALSE, FALSE))
+	return JPEG_SUSPENDED;
+      break;
+
     case M_SOF1:		/* Extended sequential, Huffman */
-      if (! get_sof(cinfo, FALSE, FALSE))
+      if (! get_sof(cinfo, FALSE, FALSE, FALSE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF2:		/* Progressive, Huffman */
-      if (! get_sof(cinfo, TRUE, FALSE))
+      if (! get_sof(cinfo, FALSE, TRUE, FALSE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF9:		/* Extended sequential, arithmetic */
-      if (! get_sof(cinfo, FALSE, TRUE))
+      if (! get_sof(cinfo, FALSE, FALSE, TRUE))
 	return JPEG_SUSPENDED;
       break;
 
     case M_SOF10:		/* Progressive, arithmetic */
-      if (! get_sof(cinfo, TRUE, TRUE))
+      if (! get_sof(cinfo, FALSE, TRUE, TRUE))
 	return JPEG_SUSPENDED;
       break;
 
diff --git a/3rdparty/libjpeg/jdmaster.c b/3rdparty/libjpeg/jdmaster.c
index 2802c5b7b2..fef72a21b4 100644
--- a/3rdparty/libjpeg/jdmaster.c
+++ b/3rdparty/libjpeg/jdmaster.c
@@ -2,6 +2,7 @@
  * jdmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -61,9 +62,12 @@ use_merged_upsample (j_decompress_ptr cinfo)
       cinfo->comp_info[2].v_samp_factor != 1)
     return FALSE;
   /* furthermore, it doesn't work if we've scaled the IDCTs differently */
-  if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+  if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size)
     return FALSE;
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;			/* by golly, it'll work... */
@@ -82,7 +86,9 @@ use_merged_upsample (j_decompress_ptr cinfo)
 
 GLOBAL(void)
 jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
-/* Do computations that are needed before master selection phase */
+/* Do computations that are needed before master selection phase.
+ * This function is used for full decompression.
+ */
 {
 #ifdef IDCT_SCALING_SUPPORTED
   int ci;
@@ -93,52 +99,38 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
   if (cinfo->global_state != DSTATE_READY)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
+  /* Compute core output image dimensions and DCT scaling choices. */
+  jpeg_core_output_dimensions(cinfo);
+
 #ifdef IDCT_SCALING_SUPPORTED
 
-  /* Compute actual output image dimensions and DCT scaling choices. */
-  if (cinfo->scale_num * 8 <= cinfo->scale_denom) {
-    /* Provide 1/8 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 8L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 8L);
-    cinfo->min_DCT_scaled_size = 1;
-  } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
-    /* Provide 1/4 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 4L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 4L);
-    cinfo->min_DCT_scaled_size = 2;
-  } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
-    /* Provide 1/2 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 2L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 2L);
-    cinfo->min_DCT_scaled_size = 4;
-  } else {
-    /* Provide 1/1 scaling */
-    cinfo->output_width = cinfo->image_width;
-    cinfo->output_height = cinfo->image_height;
-    cinfo->min_DCT_scaled_size = DCTSIZE;
-  }
   /* In selecting the actual DCT scaling for each component, we try to
    * scale up the chroma components via IDCT scaling rather than upsampling.
    * This saves time if the upsampler gets to use 1:1 scaling.
-   * Note this code assumes that the supported DCT scalings are powers of 2.
+   * Note this code adapts subsampling ratios which are powers of 2.
    */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    int ssize = cinfo->min_DCT_scaled_size;
-    while (ssize < DCTSIZE &&
-	   (compptr->h_samp_factor * ssize * 2 <=
-	    cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
-	   (compptr->v_samp_factor * ssize * 2 <=
-	    cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+    int ssize = 1;
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+	   (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
       ssize = ssize * 2;
     }
-    compptr->DCT_scaled_size = ssize;
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+	   (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+	   (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support IDCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+	compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+	compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
   }
 
   /* Recompute downsampled dimensions of components;
@@ -149,23 +141,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width *
-		    (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
-		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+		    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
     compptr->downsampled_height = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_height *
-		    (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
-		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+		    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
   }
 
-#else /* !IDCT_SCALING_SUPPORTED */
-
-  /* Hardwire it to "no scaling" */
-  cinfo->output_width = cinfo->image_width;
-  cinfo->output_height = cinfo->image_height;
-  /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
-   * and has computed unscaled downsampled_width and downsampled_height.
-   */
-
 #endif /* IDCT_SCALING_SUPPORTED */
 
   /* Report number of components in selected colorspace. */
@@ -175,10 +158,8 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     cinfo->out_color_components = 1;
     break;
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
     cinfo->out_color_components = RGB_PIXELSIZE;
     break;
-#endif /* else share code with YCbCr */
   case JCS_YCbCr:
     cinfo->out_color_components = 3;
     break;
@@ -372,17 +353,10 @@ master_selection (j_decompress_ptr cinfo)
   /* Inverse DCT */
   jinit_inverse_dct(cinfo);
   /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_decoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
+    jinit_huff_decoder(cinfo);
   }
 
   /* Initialize principal buffer controllers. */
diff --git a/3rdparty/libjpeg/jdphuff.c b/3rdparty/libjpeg/jdphuff.c
deleted file mode 100644
index 2267809945..0000000000
--- a/3rdparty/libjpeg/jdphuff.c
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * jdphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy decoding routines for progressive JPEG.
- *
- * Much of the complexity here has to do with supporting input suspension.
- * If the data source module demands suspension, we want to be able to back
- * up to the start of the current MCU.  To do this, we copy state variables
- * into local working storage, and update them back to the permanent
- * storage only upon successful completion of an MCU.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdhuff.c */
-
-
-#ifdef D_PROGRESSIVE_SUPPORTED
-
-/*
- * Expanded entropy decoder object for progressive Huffman decoding.
- *
- * The savable_state subrecord contains fields that change within an MCU,
- * but must not be updated permanently until we complete the MCU.
- */
-
-typedef struct {
-  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
-  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
-} savable_state;
-
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-	((dest).EOBRUN = (src).EOBRUN, \
-	 (dest).last_dc_val[0] = (src).last_dc_val[0], \
-	 (dest).last_dc_val[1] = (src).last_dc_val[1], \
-	 (dest).last_dc_val[2] = (src).last_dc_val[2], \
-	 (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
-typedef struct {
-  struct jpeg_entropy_decoder pub; /* public fields */
-
-  /* These fields are loaded into local variables at start of each MCU.
-   * In case of suspension, we exit WITHOUT updating them.
-   */
-  bitread_perm_state bitstate;	/* Bit buffer at start of MCU */
-  savable_state saved;		/* Other state at start of MCU */
-
-  /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-
-  /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
-} phuff_entropy_decoder;
-
-typedef phuff_entropy_decoder * phuff_entropy_ptr;
-
-/* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_first JPP((j_decompress_ptr cinfo,
-					    JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_DC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_refine JPP((j_decompress_ptr cinfo,
-					     JBLOCKROW *MCU_data));
-
-
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band, bad;
-  int ci, coefi, tbl;
-  int *coef_bit_ptr;
-  jpeg_component_info * compptr;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* Validate scan parameters */
-  bad = FALSE;
-  if (is_DC_band) {
-    if (cinfo->Se != 0)
-      bad = TRUE;
-  } else {
-    /* need not check Ss/Se < 0 since they came from unsigned bytes */
-    if (cinfo->Ss > cinfo->Se || cinfo->Se >= DCTSIZE2)
-      bad = TRUE;
-    /* AC scans may have only one component */
-    if (cinfo->comps_in_scan != 1)
-      bad = TRUE;
-  }
-  if (cinfo->Ah != 0) {
-    /* Successive approximation refinement scan: must have Al = Ah-1. */
-    if (cinfo->Al != cinfo->Ah-1)
-      bad = TRUE;
-  }
-  if (cinfo->Al > 13)		/* need not check for < 0 */
-    bad = TRUE;
-  /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
-   * but the spec doesn't say so, and we try to be liberal about what we
-   * accept.  Note: large Al values could result in out-of-range DC
-   * coefficients during early scans, leading to bizarre displays due to
-   * overflows in the IDCT math.  But we won't crash.
-   */
-  if (bad)
-    ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
-	     cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
-  /* Update progression status, and verify that scan order is legal.
-   * Note that inter-scan inconsistencies are treated as warnings
-   * not fatal errors ... not clear if this is right way to behave.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    int cindex = cinfo->cur_comp_info[ci]->component_index;
-    coef_bit_ptr = & cinfo->coef_bits[cindex][0];
-    if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
-      WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
-    for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
-      int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
-      if (cinfo->Ah != expected)
-	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
-      coef_bit_ptr[coefi] = cinfo->Al;
-    }
-  }
-
-  /* Select MCU decoding routine */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_first;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_refine;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_refine;
-  }
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Make sure requested tables are present, and compute derived tables.
-     * We may build same derived table more than once, but it's not expensive.
-     */
-    if (is_DC_band) {
-      if (cinfo->Ah == 0) {	/* DC refinement needs no table */
-	tbl = compptr->dc_tbl_no;
-	jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
-				& entropy->derived_tbls[tbl]);
-      }
-    } else {
-      tbl = compptr->ac_tbl_no;
-      jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
-			      & entropy->derived_tbls[tbl]);
-      /* remember the single active table */
-      entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
-    }
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Initialize bitread state variables */
-  entropy->bitstate.bits_left = 0;
-  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
-  entropy->pub.insufficient_data = FALSE;
-
-  /* Initialize private state variables */
-  entropy->saved.EOBRUN = 0;
-
-  /* Initialize restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-}
-
-
-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
- * Check for a restart marker & resynchronize decoder.
- * Returns FALSE if must suspend.
- */
-
-LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int ci;
-
-  /* Throw away any unused bits remaining in bit buffer; */
-  /* include any full bytes in next_marker's count of discarded bytes */
-  cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8;
-  entropy->bitstate.bits_left = 0;
-
-  /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
-    return FALSE;
-
-  /* Re-initialize DC predictions to 0 */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++)
-    entropy->saved.last_dc_val[ci] = 0;
-  /* Re-init EOB run count, too */
-  entropy->saved.EOBRUN = 0;
-
-  /* Reset restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-
-  /* Reset out-of-data flag, unless read_restart_marker left us smack up
-   * against a marker.  In that case we will end up treating the next data
-   * segment as empty, and we can avoid producing bogus output pixels by
-   * leaving the flag set.
-   */
-  if (cinfo->unread_marker == 0)
-    entropy->pub.insufficient_data = FALSE;
-
-  return TRUE;
-}
-
-
-/*
- * Huffman MCU decoding.
- * Each of these routines decodes and returns one MCU's worth of
- * Huffman-compressed coefficients. 
- * The coefficients are reordered from zigzag order into natural array order,
- * but are not dequantized.
- *
- * The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
- *
- * We return FALSE if data source requested suspension.  In that case no
- * changes have been made to permanent state.  (Exception: some output
- * coefficients may already have been assigned.  This is harmless for
- * spectral selection, since we'll just re-assign them on the next call.
- * Successive approximation AC refinement has to be more careful, however.)
- */
-
-/*
- * MCU decoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Al = cinfo->Al;
-  register int s, r;
-  int blkn, ci;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
-
-    /* Outer loop handles each block in the MCU */
-
-    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      block = MCU_data[blkn];
-      ci = cinfo->MCU_membership[blkn];
-      compptr = cinfo->cur_comp_info[ci];
-      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
-
-      /* Decode a single block's worth of coefficients */
-
-      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
-      }
-
-      /* Convert DC difference to actual value, update last_dc_val */
-      s += state.last_dc_val[ci];
-      state.last_dc_val[ci] = s;
-      /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) (s << Al);
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state.
-     * We can avoid loading/saving bitread state if in an EOB run.
-     */
-    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-
-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
-      EOBRUN--;			/* ...process it now (we do nothing) */
-    else {
-      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-      block = MCU_data[0];
-      tbl = entropy->ac_derived_tbl;
-
-      for (k = cinfo->Ss; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  k += r;
-	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	  r = GET_BITS(s);
-	  s = HUFF_EXTEND(r, s);
-	  /* Scale and output coefficient in natural (dezigzagged) order */
-	  (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
-	} else {
-	  if (r == 15) {	/* ZRL */
-	    k += 15;		/* skip 15 zeroes in band */
-	  } else {		/* EOBr, run length is 2^r + appended bits */
-	    EOBRUN = 1 << r;
-	    if (r) {		/* EOBr, r > 0 */
-	      CHECK_BIT_BUFFER(br_state, r, return FALSE);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    EOBRUN--;		/* this band is processed at this moment */
-	    break;		/* force end-of-band */
-	  }
-	}
-      }
-
-      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    }
-
-    /* Completed MCU, so update state */
-    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int blkn;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* Not worth the cycles to check insufficient_data here,
-   * since we will not change the data anyway if we read zeroes.
-   */
-
-  /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-
-  /* Outer loop handles each block in the MCU */
-
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* Encoded data is simply the next bit of the two's-complement DC value */
-    CHECK_BIT_BUFFER(br_state, 1, return FALSE);
-    if (GET_BITS(1))
-      (*block)[0] |= p1;
-    /* Note: since we use |=, repeating the assignment later is safe */
-  }
-
-  /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  JCOEFPTR thiscoef;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-  int num_newnz;
-  int newnz_pos[DCTSIZE2];
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-	return FALSE;
-  }
-
-  /* If we've run out of data, don't modify the MCU.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-    block = MCU_data[0];
-    tbl = entropy->ac_derived_tbl;
-
-    /* If we are forced to suspend, we must undo the assignments to any newly
-     * nonzero coefficients in the block, because otherwise we'd get confused
-     * next time about which coefficients were already nonzero.
-     * But we need not undo addition of bits to already-nonzero coefficients;
-     * instead, we can test the current bit to see if we already did it.
-     */
-    num_newnz = 0;
-
-    /* initialize coefficient loop counter to start of band */
-    k = cinfo->Ss;
-
-    if (EOBRUN == 0) {
-      for (; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  if (s != 1)		/* size of new coef should always be 1 */
-	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1))
-	    s = p1;		/* newly nonzero coef is positive */
-	  else
-	    s = m1;		/* newly nonzero coef is negative */
-	} else {
-	  if (r != 15) {
-	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
-	    if (r) {
-	      CHECK_BIT_BUFFER(br_state, r, goto undoit);
-	      r = GET_BITS(r);
-	      EOBRUN += r;
-	    }
-	    break;		/* rest of block is handled by EOB logic */
-	  }
-	  /* note s = 0 for processing ZRL */
-	}
-	/* Advance over already-nonzero coefs and r still-zero coefs,
-	 * appending correction bits to the nonzeroes.  A correction bit is 1
-	 * if the absolute value of the coefficient must be increased.
-	 */
-	do {
-	  thiscoef = *block + jpeg_natural_order[k];
-	  if (*thiscoef != 0) {
-	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	    if (GET_BITS(1)) {
-	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-		if (*thiscoef >= 0)
-		  *thiscoef += p1;
-		else
-		  *thiscoef += m1;
-	      }
-	    }
-	  } else {
-	    if (--r < 0)
-	      break;		/* reached target zero coefficient */
-	  }
-	  k++;
-	} while (k <= Se);
-	if (s) {
-	  int pos = jpeg_natural_order[k];
-	  /* Output newly nonzero coefficient */
-	  (*block)[pos] = (JCOEF) s;
-	  /* Remember its position in case we have to suspend */
-	  newnz_pos[num_newnz++] = pos;
-	}
-      }
-    }
-
-    if (EOBRUN > 0) {
-      /* Scan any remaining coefficient positions after the end-of-band
-       * (the last newly nonzero coefficient, if any).  Append a correction
-       * bit to each already-nonzero coefficient.  A correction bit is 1
-       * if the absolute value of the coefficient must be increased.
-       */
-      for (; k <= Se; k++) {
-	thiscoef = *block + jpeg_natural_order[k];
-	if (*thiscoef != 0) {
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1)) {
-	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-	      if (*thiscoef >= 0)
-		*thiscoef += p1;
-	      else
-		*thiscoef += m1;
-	    }
-	  }
-	}
-      }
-      /* Count one block completed in EOB run */
-      EOBRUN--;
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-
-undoit:
-  /* Re-zero any output coefficients that we made newly nonzero */
-  while (num_newnz > 0)
-    (*block)[newnz_pos[--num_newnz]] = 0;
-
-  return FALSE;
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy decoding.
- */
-
-GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int *coef_bit_ptr;
-  int ci, i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(phuff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff_decoder;
-
-  /* Mark derived tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-  }
-
-  /* Create progression status table */
-  cinfo->coef_bits = (int (*)[DCTSIZE2])
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				cinfo->num_components*DCTSIZE2*SIZEOF(int));
-  coef_bit_ptr = & cinfo->coef_bits[0][0];
-  for (ci = 0; ci < cinfo->num_components; ci++) 
-    for (i = 0; i < DCTSIZE2; i++)
-      *coef_bit_ptr++ = -1;
-}
-
-#endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/3rdparty/libjpeg/jdsample.c b/3rdparty/libjpeg/jdsample.c
index 80ffefb2a1..7bc8885b02 100644
--- a/3rdparty/libjpeg/jdsample.c
+++ b/3rdparty/libjpeg/jdsample.c
@@ -2,13 +2,14 @@
  * jdsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2002-2008 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
  * This file contains upsampling routines.
  *
  * Upsampling input data is counted in "row groups".  A row group
- * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * is defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size)
  * sample rows of each component.  Upsampling will normally produce
  * max_v_samp_factor pixel rows from each row group (but this could vary
  * if the upsampler is applying a scale factor of its own).
@@ -237,11 +238,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
   JSAMPROW outend;
-  int inrow;
+  int outrow;
 
-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
+  for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) {
+    inptr = input_data[outrow];
+    outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
       invalue = *inptr++;	/* don't need GETJSAMPLE() here */
@@ -285,112 +286,6 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 }
 
 
-/*
- * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
- *
- * The upsampling algorithm is linear interpolation between pixel centers,
- * also known as a "triangle filter".  This is a good compromise between
- * speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
- * of the way between input pixel centers.
- *
- * A note about the "bias" calculations: when rounding fractional values to
- * integer, we do not want to always round 0.5 up to the next integer.
- * If we did that, we'd introduce a noticeable bias towards larger values.
- * Instead, this code is arranged so that 0.5 will be rounded up or down at
- * alternate pixel locations (a simple ordered dither pattern).
- */
-
-METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register int invalue;
-  register JDIMENSION colctr;
-  int inrow;
-
-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
-    /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
-    *outptr++ = (JSAMPLE) invalue;
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
-
-    for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-      /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
-    }
-
-    /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
-    *outptr++ = (JSAMPLE) invalue;
-  }
-}
-
-
-/*
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- * Again a triangle filter; see comments for h2v1 case, above.
- *
- * It is OK for us to reference the adjacent input rows because we demanded
- * context from the main buffer controller (see initialization code).
- */
-
-METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr0, inptr1, outptr;
-#if BITS_IN_JSAMPLE == 8
-  register int thiscolsum, lastcolsum, nextcolsum;
-#else
-  register INT32 thiscolsum, lastcolsum, nextcolsum;
-#endif
-  register JDIMENSION colctr;
-  int inrow, outrow, v;
-
-  inrow = outrow = 0;
-  while (outrow < cinfo->max_v_samp_factor) {
-    for (v = 0; v < 2; v++) {
-      /* inptr0 points to nearest input row, inptr1 points to next nearest */
-      inptr0 = input_data[inrow];
-      if (v == 0)		/* next nearest is row above */
-	inptr1 = input_data[inrow-1];
-      else			/* next nearest is row below */
-	inptr1 = input_data[inrow+1];
-      outptr = output_data[outrow++];
-
-      /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-      lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-
-      for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-	/* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
-	/* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-	nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-	*outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-	lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-      }
-
-      /* Special case for last column */
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
-    }
-    inrow++;
-  }
-}
-
-
 /*
  * Module initialization routine for upsampling.
  */
@@ -401,7 +296,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
   my_upsample_ptr upsample;
   int ci;
   jpeg_component_info * compptr;
-  boolean need_buffer, do_fancy;
+  boolean need_buffer;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
   upsample = (my_upsample_ptr)
@@ -415,11 +310,6 @@ jinit_upsampler (j_decompress_ptr cinfo)
   if (cinfo->CCIR601_sampling)	/* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
 
-  /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
-   * so don't ask for it.
-   */
-  do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
-
   /* Verify we can handle the sampling factors, select per-component methods,
    * and create storage as needed.
    */
@@ -428,10 +318,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
     /* Compute size of an "input group" after IDCT scaling.  This many samples
      * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
      */
-    h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
-    v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
+    h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+		 cinfo->min_DCT_h_scaled_size;
+    v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+		 cinfo->min_DCT_v_scaled_size;
     h_out_group = cinfo->max_h_samp_factor;
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -446,19 +336,12 @@ jinit_upsampler (j_decompress_ptr cinfo)
       need_buffer = FALSE;
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
-      /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      /* Special case for 2h1v upsampling */
+      upsample->methods[ci] = h2v1_upsample;
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
-      /* Special cases for 2h2v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
-	upsample->pub.need_context_rows = TRUE;
-      } else
-	upsample->methods[ci] = h2v2_upsample;
+      /* Special case for 2h2v upsampling */
+      upsample->methods[ci] = h2v2_upsample;
     } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
diff --git a/3rdparty/libjpeg/jdtrans.c b/3rdparty/libjpeg/jdtrans.c
index 6c0ab715d3..22dd47fb5c 100644
--- a/3rdparty/libjpeg/jdtrans.c
+++ b/3rdparty/libjpeg/jdtrans.c
@@ -2,6 +2,7 @@
  * jdtrans.c
  *
  * Copyright (C) 1995-1997, Thomas G. Lane.
+ * Modified 2000-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -99,18 +100,14 @@ transdecode_master_selection (j_decompress_ptr cinfo)
   /* This is effectively a buffered-image operation. */
   cinfo->buffered_image = TRUE;
 
+  /* Compute output image dimensions and related values. */
+  jpeg_core_output_dimensions(cinfo);
+
   /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
-      jinit_huff_decoder(cinfo);
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
+    jinit_huff_decoder(cinfo);
   }
 
   /* Always get a full-image coefficient buffer. */
diff --git a/3rdparty/libjpeg/jerror.c b/3rdparty/libjpeg/jerror.c
index a44463cf18..3da7be86a0 100644
--- a/3rdparty/libjpeg/jerror.c
+++ b/3rdparty/libjpeg/jerror.c
@@ -23,7 +23,6 @@
 #include "jpeglib.h"
 #include "jversion.h"
 #include "jerror.h"
-#include <stdlib.h>
 
 #ifdef USE_WINDOWS_MESSAGEBOX
 #include <windows.h>
diff --git a/3rdparty/libjpeg/jerror.h b/3rdparty/libjpeg/jerror.h
index 41a69d5e24..1cfb2b19d8 100644
--- a/3rdparty/libjpeg/jerror.h
+++ b/3rdparty/libjpeg/jerror.h
@@ -2,6 +2,7 @@
  * jerror.h
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -39,15 +40,15 @@ typedef enum {
 JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
 
 /* For maintenance convenience, list is alphabetical by message code name */
-JMESSAGE(JERR_ARITH_NOTIMPL,
-	 "Sorry, there are legal restrictions on arithmetic coding")
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
 JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
 JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
 JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
-JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported")
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+	 "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
 JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
 JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
@@ -94,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@@ -171,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
 	 "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
@@ -228,6 +231,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
    (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
 #define ERREXITS(cinfo,code,str)  \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
diff --git a/3rdparty/libjpeg/jfdctflt.c b/3rdparty/libjpeg/jfdctflt.c
index 79d7a00787..74d0d862dc 100644
--- a/3rdparty/libjpeg/jfdctflt.c
+++ b/3rdparty/libjpeg/jfdctflt.c
@@ -2,6 +2,7 @@
  * jfdctflt.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -56,41 +57,46 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
   FAST_FLOAT z1, z2, z3, z4, z5, z11, z13;
   FAST_FLOAT *dataptr;
+  JSAMPROW elemptr;
   int ctr;
 
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]));
+    tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]));
+    tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]));
+    tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]));
+    tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]));
+    tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]));
+    tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]));
+    tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]));
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
     dataptr[2] = tmp13 + z1;	/* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
@@ -126,21 +132,21 @@ jpeg_fdct_float (FAST_FLOAT * data)
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
diff --git a/3rdparty/libjpeg/jfdctfst.c b/3rdparty/libjpeg/jfdctfst.c
index ccb378a3b4..8cad5f2293 100644
--- a/3rdparty/libjpeg/jfdctfst.c
+++ b/3rdparty/libjpeg/jfdctfst.c
@@ -2,6 +2,7 @@
  * jfdctfst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -111,42 +112,47 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
   DCTELEM z1, z2, z3, z4, z5, z11, z13;
   DCTELEM *dataptr;
+  JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+    tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
     dataptr[2] = tmp13 + z1;	/* phase 5 */
     dataptr[6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
@@ -182,21 +188,21 @@ jpeg_fdct_ifast (DCTELEM * data)
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
+
     /* Even part */
-    
+
     tmp10 = tmp0 + tmp3;	/* phase 2 */
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
     dataptr[DCTSIZE*4] = tmp10 - tmp11;
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
     dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
     dataptr[DCTSIZE*6] = tmp13 - z1;
-    
+
     /* Odd part */
 
     tmp10 = tmp4 + tmp5;	/* phase 2 */
diff --git a/3rdparty/libjpeg/jfdctint.c b/3rdparty/libjpeg/jfdctint.c
index 0a78b64aee..1dde58c499 100644
--- a/3rdparty/libjpeg/jfdctint.c
+++ b/3rdparty/libjpeg/jfdctint.c
@@ -2,6 +2,7 @@
  * jfdctint.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modification developed 2003-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -21,6 +22,23 @@
  * The advantage of this method is that no data path contains more than one
  * multiplication; this allows a very simple and accurate implementation in
  * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide FDCT routines with various input sample block sizes for
+ * direct resolution reduction or enlargement and for direct resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
+ *
+ * For N<8 we fill the remaining block coefficients with zero.
+ * For N>8 we apply a partial N-point FDCT on the input samples, computing
+ * just the lower 8 frequency coefficients and discarding the rest.
+ *
+ * We must scale the output coefficients of the N-point FDCT appropriately
+ * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
+ * is folded into the constant multipliers (pass 2) and/or final/initial
+ * shifting.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
  */
 
 #define JPEG_INTERNALS
@@ -36,7 +54,7 @@
  */
 
 #if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
 #endif
 
 
@@ -137,12 +155,13 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM * data)
+jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp0, tmp1, tmp2, tmp3;
   INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  INT32 z1;
   DCTELEM *dataptr;
+  JSAMPROW elemptr;
   int ctr;
   SHIFT_TEMPS
 
@@ -151,62 +170,74 @@ jpeg_fdct_islow (DCTELEM * data)
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
-    
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
      */
-    
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
     tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
+    tmp12 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
-    
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-				   CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-				   CONST_BITS-PASS1_BITS);
-    
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				       CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				       CONST_BITS-PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents cos(K*pi/16).
-     * i0..i3 in the paper are tmp4..tmp7 here.
+     * cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
      */
-    
-    z1 = tmp4 + tmp7;
-    z2 = tmp5 + tmp6;
-    z3 = tmp4 + tmp6;
-    z4 = tmp5 + tmp7;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
-    
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
     dataptr += DCTSIZE;		/* advance pointer to next row */
   }
 
@@ -217,67 +248,4101 @@ jpeg_fdct_islow (DCTELEM * data)
 
   dataptr = data;
   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
-    
-    /* Even part per LL&M figure 1 --- note that published figure is faulty;
-     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
-     */
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+    tmp12 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
-    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
-    
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-					   CONST_BITS+PASS1_BITS);
-    
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
-     * cK represents cos(K*pi/16).
-     * i0..i3 in the paper are tmp4..tmp7 here.
+     * cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
      */
-    
-    z1 = tmp4 + tmp7;
-    z2 = tmp5 + tmp6;
-    z3 = tmp4 + tmp6;
-    z4 = tmp5 + tmp7;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
-					   CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
-					   CONST_BITS+PASS1_BITS);
-    
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+#ifdef DCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform the forward DCT on a 7x7 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/14). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+    tmp3 = GETJSAMPLE(elemptr[3]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+    z1 = tmp0 + tmp2;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/7)**2 = 64/49, which we fold
+   * into the constant multipliers:
+   * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+    z1 = tmp0 + tmp2;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x6 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11, tmp12;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/12). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
+		    CONST_BITS-PASS1_BITS);
+
+    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)**2 = 16/9, which we fold
+   * into the constant multipliers:
+   * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x5 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/10). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+    tmp2 = GETJSAMPLE(elemptr[2]);
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
+    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
+    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+	      CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/5)**2 = 64/25, which we partially
+   * fold into the constant multipliers (other part was done in pass 1):
+   * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2];
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x4 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
+  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
+    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-2);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-2);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS+PASS1_BITS);
+
     dataptr++;			/* advance pointer to next column */
   }
 }
 
+
+/*
+ * Perform the forward DCT on a 3x3 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2**2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/6). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+	      CONST_BITS-PASS1_BITS-2);
+
+    /* Odd part */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
+	      CONST_BITS-PASS1_BITS-2);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/3)**2 = 64/9, which we partially
+   * fold into the constant multipliers (other part was done in pass 1):
+   * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
+    tmp1 = dataptr[DCTSIZE*1];
+
+    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x2 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  JSAMPROW elemptr;
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+
+  /* Row 0 */
+  elemptr = sample_data[0] + start_col;
+
+  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+  /* Row 1 */
+  elemptr = sample_data[1] + start_col;
+
+  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
+  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/2)**2 = 2**4.
+   */
+
+  /* Column 0 */
+  /* Apply unsigned->signed conversion */
+  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
+  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
+
+  /* Column 1 */
+  data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
+  data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
+}
+
+
+/*
+ * Perform the forward DCT on a 1x1 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* We leave the result scaled up by an overall factor of 8. */
+  /* We must also scale the output by (8/1)**2 = 2**6. */
+  /* Apply unsigned->signed conversion */
+  data[0] = (DCTELEM)
+    ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
+}
+
+
+/*
+ * Perform the forward DCT on a 9x9 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2;
+  DCTELEM workspace[8];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/18). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
+    tmp4 = GETJSAMPLE(elemptr[4]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
+
+    z1 = tmp0 + tmp2 + tmp3;
+    z2 = tmp1 + tmp4;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
+	      CONST_BITS-1);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
+    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
+	      + z1 + z2, CONST_BITS-1);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
+	      + z1 - z2, CONST_BITS-1);
+
+    /* Odd part */
+
+    dataptr[3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
+	      CONST_BITS-1);
+
+    tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
+    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
+    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
+
+    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
+
+    dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 9)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/9)**2 = 64/81, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
+    tmp4 = dataptr[DCTSIZE*4];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
+    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
+
+    z1 = tmp0 + tmp2 + tmp3;
+    z2 = tmp1 + tmp4;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
+	      CONST_BITS+2);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
+    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
+	      + z1 + z2, CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
+	      + z1 - z2, CONST_BITS+2);
+
+    /* Odd part */
+
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
+	      CONST_BITS+2);
+
+    tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
+    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
+    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
+
+    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
+
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 10x10 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM workspace[8*2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/20). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
+    tmp12 += tmp12;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
+	      CONST_BITS-1);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
+	      CONST_BITS-1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
+	      CONST_BITS-1);
+
+    /* Odd part */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
+    tmp2 <<= CONST_BITS;
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
+	      CONST_BITS-1);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
+	    (tmp11 << (CONST_BITS - 1)) - tmp2;
+    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 10)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/10)**2 = 16/25, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+	      CONST_BITS+2);
+    tmp12 += tmp12;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
+	      CONST_BITS+2);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
+	      CONST_BITS+2);
+
+    /* Odd part */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
+	      CONST_BITS+2);
+    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
+	      CONST_BITS+2);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
+	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on an 11x11 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*3];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* we scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* cK represents sqrt(2) * cos(K*pi/22). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
+    tmp5 = GETJSAMPLE(elemptr[5]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
+    tmp5 += tmp5;
+    tmp0 -= tmp5;
+    tmp1 -= tmp5;
+    tmp2 -= tmp5;
+    tmp3 -= tmp5;
+    tmp4 -= tmp5;
+    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
+	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
+    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
+    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
+	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
+	      CONST_BITS-1);
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
+	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
+	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
+	      CONST_BITS-1);
+    dataptr[6] = (DCTELEM)
+      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
+	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
+	      CONST_BITS-1);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
+    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
+	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
+    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
+    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
+	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
+    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
+    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
+	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
+    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
+	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 11)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/11)**2 = 64/121, which we partially
+   * fold into the constant multipliers and final/initial shifting:
+   * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
+    tmp5 = dataptr[DCTSIZE*5];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
+    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
+    tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
+		       FIX(1.057851240)),                /* 128/121 */
+	      CONST_BITS+2);
+    tmp5 += tmp5;
+    tmp0 -= tmp5;
+    tmp1 -= tmp5;
+    tmp2 -= tmp5;
+    tmp3 -= tmp5;
+    tmp4 -= tmp5;
+    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
+	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
+    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
+    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
+	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
+	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
+	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
+	      CONST_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
+	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
+	      CONST_BITS+2);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
+    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
+	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
+    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
+    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
+	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
+    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
+    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
+	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
+    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
+	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 12x12 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM workspace[8*4];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/24). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
+    dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+	      CONST_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+	      CONST_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 12)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/12)**2 = 4/9, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
+	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
+	      CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 13x13 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 z1, z2;
+  DCTELEM workspace[8*5];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/26). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
+    tmp6 = GETJSAMPLE(elemptr[6]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
+    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
+    tmp6 += tmp6;
+    tmp0 -= tmp6;
+    tmp1 -= tmp6;
+    tmp2 -= tmp6;
+    tmp3 -= tmp6;
+    tmp4 -= tmp6;
+    tmp5 -= tmp6;
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
+	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
+	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
+	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
+	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
+	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
+	      CONST_BITS);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
+	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
+	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
+    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
+	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
+	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
+
+    dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
+	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
+    tmp0 = tmp1 + tmp2 + tmp3 -
+	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
+	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
+    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
+	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
+    tmp1 += tmp4 + tmp5 +
+	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
+	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
+    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
+    tmp2 += tmp4 + tmp6 -
+	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
+	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
+    tmp3 += tmp5 + tmp6 +
+	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
+	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 13)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/13)**2 = 64/169, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
+    tmp6 = dataptr[DCTSIZE*6];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
+    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
+    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
+    tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
+		       FIX(0.757396450)),          /* 128/169 */
+	      CONST_BITS+1);
+    tmp6 += tmp6;
+    tmp0 -= tmp6;
+    tmp1 -= tmp6;
+    tmp2 -= tmp6;
+    tmp3 -= tmp6;
+    tmp4 -= tmp6;
+    tmp5 -= tmp6;
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
+	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
+	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
+	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
+	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
+	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
+	      CONST_BITS+1);
+    z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
+	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
+	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
+    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
+	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
+	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
+
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
+    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
+    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
+	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
+    tmp0 = tmp1 + tmp2 + tmp3 -
+	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
+	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
+    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
+	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
+    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
+    tmp1 += tmp4 + tmp5 +
+	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
+	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
+    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
+    tmp2 += tmp4 + tmp6 -
+	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
+	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
+    tmp3 += tmp5 + tmp6 +
+	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
+	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 14x14 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  DCTELEM workspace[8*6];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/28). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
+    tmp13 += tmp13;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
+	      CONST_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
+	      CONST_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
+	      CONST_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
+    tmp3 <<= CONST_BITS;
+    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
+    dataptr[5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
+	      CONST_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
+	      CONST_BITS);
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
+	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
+	      CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 14)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/14)**2 = 16/49, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+1);
+    tmp13 += tmp13;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
+	      CONST_BITS+1);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
+	      CONST_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+1);
+    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
+	      CONST_BITS+1);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
+	      CONST_BITS+1);
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3
+	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
+	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
+	      CONST_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 15x15 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*7];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* cK represents sqrt(2) * cos(K*pi/30). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
+    tmp7 = GETJSAMPLE(elemptr[7]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
+    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
+    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
+    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
+    tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
+
+    z1 = tmp0 + tmp4 + tmp5;
+    z2 = tmp1 + tmp3 + tmp6;
+    z3 = tmp2 + tmp7;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
+    z3 += z3;
+    dataptr[6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
+	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
+	      CONST_BITS);
+    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
+    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
+         MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
+    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
+	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
+    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
+	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
+	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
+
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
+    dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
+
+    /* Odd part */
+
+    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+		    FIX(1.224744871));                         /* c5 */
+    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
+	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
+    tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
+    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
+	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
+	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
+    tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
+	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
+	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
+    tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
+	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
+	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 15)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/15)**2 = 64/225, which we partially
+   * fold into the constant multipliers and final shifting:
+   * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
+    tmp7 = dataptr[DCTSIZE*7];
+
+    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
+    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
+    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
+    tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
+    tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
+
+    z1 = tmp0 + tmp4 + tmp5;
+    z2 = tmp1 + tmp3 + tmp6;
+    z3 = tmp2 + tmp7;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
+	      CONST_BITS+2);
+    z3 += z3;
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
+	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
+	      CONST_BITS+2);
+    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
+    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
+         MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
+    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
+	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
+    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
+	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
+	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
+
+    /* Odd part */
+
+    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
+		    FIX(1.393487498));                         /* c5 */
+    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
+	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
+    tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
+    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
+	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
+	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
+    tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
+	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
+	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
+    tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
+	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
+	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 16x16 sample block.
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  DCTELEM workspace[DCTSIZE2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* cK represents sqrt(2) * cos(K*pi/32). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == DCTSIZE * 2)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/16)**2 = 1/2**2.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS+PASS1_BITS+2);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
+	      CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS+PASS1_BITS+2);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 16x8 sample block.
+ *
+ * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
+
+  dataptr = data;
+  ctr = 0;
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
+    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by 8/16 = 1/2.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+					   CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
+					   CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
+					   CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 14x7 sample block.
+ *
+ * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero bottom row of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 7; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
+    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
+    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp13 += tmp13;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
+	      CONST_BITS-PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
+
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
+    tmp3 <<= CONST_BITS;
+    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
+    dataptr[5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
+	      CONST_BITS-PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
+	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
+	      CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/14)*(8/7) = 32/49, which we
+   * partially fold into the constant multipliers and final shifting:
+   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
+    tmp3 = dataptr[DCTSIZE*3];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
+    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
+
+    z1 = tmp0 + tmp2;
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
+	      CONST_BITS+PASS1_BITS+1);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 12x6 sample block.
+ *
+ * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 2 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
+    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/12)*(8/6) = 8/9, which we
+   * partially fold into the constant multipliers and final shifting:
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 10x5 sample block.
+ *
+ * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 3 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 5; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
+    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
+    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp12 += tmp12;
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
+	      CONST_BITS-PASS1_BITS);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
+    dataptr[2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
+    tmp2 <<= CONST_BITS;
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
+	      CONST_BITS-PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
+	    (tmp11 << (CONST_BITS - 1)) - tmp2;
+    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/10)*(8/5) = 32/25, which we
+   * fold into the constant multipliers:
+   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
+    tmp2 = dataptr[DCTSIZE*2];
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on an 8x4 sample block.
+ *
+ * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Zero 4 bottom rows of output coefficient block. */
+  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by 8/4 = 2, which we add here. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+    dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				       CONST_BITS-PASS1_BITS-1);
+    dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				       CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
+    dataptr[5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
+    dataptr[7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);   /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x3 sample block.
+ *
+ * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11, tmp12;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
+	      CONST_BITS-PASS1_BITS-1);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part */
+
+    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
+		    CONST_BITS-PASS1_BITS-1);
+
+    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
+    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
+    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
+   * fold into the constant multipliers (other part was done in pass 1):
+   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
+    tmp1 = dataptr[DCTSIZE*1];
+
+    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x2 sample block.
+ *
+ * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
+  /* 4-point FDCT kernel, */
+  /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 2; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
+    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-3);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-3);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
+    tmp1 = dataptr[DCTSIZE*1];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
+
+    /* Odd part */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x1 sample block.
+ *
+ * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  JSAMPROW elemptr;
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  elemptr = sample_data[0] + start_col;
+
+  tmp0 = GETJSAMPLE(elemptr[0]);
+  tmp1 = GETJSAMPLE(elemptr[1]);
+
+  /* We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/2)*(8/1) = 2**5.
+   */
+
+  /* Even part */
+  /* Apply unsigned->signed conversion */
+  data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+  /* Odd part */
+  data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+}
+
+
+/*
+ * Perform the forward DCT on an 8x16 sample block.
+ *
+ * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
+  INT32 z1;
+  DCTELEM workspace[DCTSIZE2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+
+    tmp10 = tmp0 + tmp3;
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
+				   CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
+				   CONST_BITS-PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == DCTSIZE * 2)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by 8/16 = 1/2.
+   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
+
+    tmp10 = tmp0 + tmp7;
+    tmp14 = tmp0 - tmp7;
+    tmp11 = tmp1 + tmp6;
+    tmp15 = tmp1 - tmp6;
+    tmp12 = tmp2 + tmp5;
+    tmp16 = tmp2 - tmp5;
+    tmp13 = tmp3 + tmp4;
+    tmp17 = tmp3 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
+    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
+    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
+	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
+	      CONST_BITS+PASS1_BITS+1);
+
+    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
+	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
+	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
+	      CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
+	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
+	      CONST_BITS+PASS1_BITS+1);
+
+    /* Odd part */
+
+    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
+	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
+	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
+	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
+    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
+	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
+    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
+	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
+    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
+	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
+	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
+    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
+	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
+    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
+	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
+    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
+	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 7x14 sample block.
+ *
+ * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 z1, z2, z3;
+  DCTELEM workspace[8*6];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
+    tmp3 = GETJSAMPLE(elemptr[3]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
+    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
+
+    z1 = tmp0 + tmp2;
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp3 += tmp3;
+    z1 -= tmp3;
+    z1 -= tmp3;
+    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
+    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
+    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
+    z1 -= z2;
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
+    dataptr[4] = (DCTELEM)
+      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
+    tmp1 += tmp2;
+    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
+    tmp0 += tmp3;
+    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
+
+    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 14)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/7)*(8/14) = 32/49, which we
+   * fold into the constant multipliers:
+   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
+    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+
+    tmp10 = tmp0 + tmp6;
+    tmp14 = tmp0 - tmp6;
+    tmp11 = tmp1 + tmp5;
+    tmp15 = tmp1 - tmp5;
+    tmp12 = tmp2 + tmp4;
+    tmp16 = tmp2 - tmp4;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
+    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
+    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
+    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp13 += tmp13;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
+	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
+	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
+	      CONST_BITS+PASS1_BITS);
+
+    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
+
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
+	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
+	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp1 + tmp2;
+    tmp11 = tmp5 - tmp4;
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
+		       FIX(0.653061224)),                 /* 32/49 */
+	      CONST_BITS+PASS1_BITS);
+    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
+    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
+    tmp10 += tmp11 - tmp3;
+    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
+	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
+	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
+	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
+	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp11 + tmp12 + tmp3
+	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
+	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 6x12 sample block.
+ *
+ * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  DCTELEM workspace[8*4];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
+    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
+    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
+	      CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
+		    CONST_BITS-PASS1_BITS);
+
+    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
+    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
+    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 12)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)*(8/12) = 8/9, which we
+   * fold into the constant multipliers:
+   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
+
+    tmp10 = tmp0 + tmp5;
+    tmp13 = tmp0 - tmp5;
+    tmp11 = tmp1 + tmp4;
+    tmp14 = tmp1 - tmp4;
+    tmp12 = tmp2 + tmp3;
+    tmp15 = tmp2 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
+    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
+    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
+    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
+	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
+    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
+    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
+    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
+    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
+    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
+	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
+    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
+    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
+	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
+    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
+	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
+    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
+	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 5x10 sample block.
+ *
+ * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  DCTELEM workspace[8*2];
+  DCTELEM *dataptr;
+  DCTELEM *wsptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
+
+  dataptr = data;
+  ctr = 0;
+  for (;;) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
+    tmp2 = GETJSAMPLE(elemptr[2]);
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+
+    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
+    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
+    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
+    tmp10 -= tmp2 << 2;
+    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
+    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
+	      CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
+	      CONST_BITS-PASS1_BITS);
+
+    ctr++;
+
+    if (ctr != DCTSIZE) {
+      if (ctr == 10)
+	break;			/* Done. */
+      dataptr += DCTSIZE;	/* advance pointer to next row */
+    } else
+      dataptr = workspace;	/* switch pointer to extended workspace */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/5)*(8/10) = 32/25, which we
+   * fold into the constant multipliers:
+   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
+   */
+
+  dataptr = data;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
+    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+
+    tmp10 = tmp0 + tmp4;
+    tmp13 = tmp0 - tmp4;
+    tmp11 = tmp1 + tmp3;
+    tmp14 = tmp1 - tmp3;
+
+    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
+    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
+    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 += tmp12;
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
+	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
+	      CONST_BITS+PASS1_BITS);
+    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = tmp0 + tmp4;
+    tmp11 = tmp1 - tmp3;
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
+	      CONST_BITS+PASS1_BITS);
+    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
+	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
+	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
+	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
+	      CONST_BITS+PASS1_BITS);
+    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
+	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
+    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
+	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+    wsptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 4x8 sample block.
+ *
+ * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We must also scale the output by 8/4 = 2, which we add here. */
+  /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
+
+    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
+    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
+
+    dataptr[1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS-PASS1_BITS-1);
+    dataptr[3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
+    tmp12 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp13 = tmp1 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM)
+      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
+
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+     * i0..i3 in the paper are tmp0..tmp3 here.
+     */
+
+    tmp10 = tmp0 + tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp1 + tmp3;
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
+
+    tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
+    tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
+    tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
+    tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
+    tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
+    tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
+    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
+    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
+
+    tmp12 += z1;
+    tmp13 += z1;
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*7] = (DCTELEM)
+      RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 3x6 sample block.
+ *
+ * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1, tmp2;
+  INT32 tmp10, tmp11, tmp12;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+  /* We scale the results further by 2 as part of output adaption */
+  /* scaling for different DCT size. */
+  /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 6; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM)
+      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
+    dataptr[2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    /* Odd part */
+
+    dataptr[1] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
+	      CONST_BITS-PASS1_BITS-1);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
+   * fold into the constant multipliers (other part was done in pass 1):
+   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 3; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
+    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+
+    tmp10 = tmp0 + tmp2;
+    tmp12 = tmp0 - tmp2;
+
+    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
+    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
+    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+
+    dataptr[DCTSIZE*0] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*2] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
+	      CONST_BITS+PASS1_BITS);
+
+    /* Odd part */
+
+    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM)
+      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
+	      CONST_BITS+PASS1_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 2x4 sample block.
+ *
+ * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+  INT32 tmp10, tmp11;
+  DCTELEM *dataptr;
+  JSAMPROW elemptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT. */
+  /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 4; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Even part */
+
+    tmp0 = GETJSAMPLE(elemptr[0]);
+    tmp1 = GETJSAMPLE(elemptr[1]);
+
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
+
+    /* Odd part */
+
+    dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
+
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We leave the results scaled up by an overall factor of 8.
+   * 4-point FDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
+   */
+
+  dataptr = data;
+  for (ctr = 0; ctr < 2; ctr++) {
+    /* Even part */
+
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
+
+    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
+    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
+
+    dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
+    dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
+
+    /* Odd part */
+
+    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-1);
+
+    dataptr[DCTSIZE*1] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
+		  CONST_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM)
+      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
+		  CONST_BITS);
+
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+
+/*
+ * Perform the forward DCT on a 1x2 sample block.
+ *
+ * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
+ */
+
+GLOBAL(void)
+jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
+{
+  INT32 tmp0, tmp1;
+
+  /* Pre-zero output coefficient block. */
+  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
+
+  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
+  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
+
+  /* We leave the results scaled up by an overall factor of 8.
+   * We must also scale the output by (8/1)*(8/2) = 2**5.
+   */
+
+  /* Even part */
+  /* Apply unsigned->signed conversion */
+  data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
+
+  /* Odd part */
+  data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
+}
+
+#endif /* DCT_SCALING_SUPPORTED */
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/3rdparty/libjpeg/jidctflt.c b/3rdparty/libjpeg/jidctflt.c
index 0188ce3dfc..23ae9d333b 100644
--- a/3rdparty/libjpeg/jidctflt.c
+++ b/3rdparty/libjpeg/jidctflt.c
@@ -2,6 +2,7 @@
  * jidctflt.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2010 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -76,10 +77,9 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   FLOAT_MULT_TYPE * quantptr;
   FAST_FLOAT * wsptr;
   JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
@@ -152,12 +152,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;	/* phase 2 */
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;
 
     wsptr[DCTSIZE*0] = tmp0 + tmp7;
     wsptr[DCTSIZE*7] = tmp0 - tmp7;
@@ -165,8 +165,8 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     wsptr[DCTSIZE*6] = tmp1 - tmp6;
     wsptr[DCTSIZE*2] = tmp2 + tmp5;
     wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*4] = tmp3 + tmp4;
-    wsptr[DCTSIZE*3] = tmp3 - tmp4;
+    wsptr[DCTSIZE*3] = tmp3 + tmp4;
+    wsptr[DCTSIZE*4] = tmp3 - tmp4;
 
     inptr++;			/* advance pointers to next column */
     quantptr++;
@@ -174,7 +174,6 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   }
   
   /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3. */
 
   wsptr = workspace;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
@@ -187,8 +186,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     
     /* Even part */
 
-    tmp10 = wsptr[0] + wsptr[4];
-    tmp11 = wsptr[0] - wsptr[4];
+    /* Apply signed->unsigned and prepare float->int conversion */
+    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    tmp10 = z5 + wsptr[4];
+    tmp11 = z5 - wsptr[4];
 
     tmp13 = wsptr[2] + wsptr[6];
     tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
@@ -209,31 +210,23 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
 
     z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;
     tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
-
-    /* Final output stage: scale down by a factor of 8 and range-limit */
-
-    outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
-			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
-			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
-			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
-			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
-			    & RANGE_MASK];
+    tmp4 = tmp10 - tmp5;
+
+    /* Final output stage: float->int conversion and range-limit */
+
+    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
     
     wsptr += DCTSIZE;		/* advance pointer to next row */
   }
diff --git a/3rdparty/libjpeg/jidctint.c b/3rdparty/libjpeg/jidctint.c
index a72b3207ca..dcdf7ce454 100644
--- a/3rdparty/libjpeg/jidctint.c
+++ b/3rdparty/libjpeg/jidctint.c
@@ -2,6 +2,7 @@
  * jidctint.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modification developed 2002-2009 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -23,6 +24,28 @@
  * The advantage of this method is that no data path contains more than one
  * multiplication; this allows a very simple and accurate implementation in
  * scaled fixed-point arithmetic, with a minimal number of shifts.
+ *
+ * We also provide IDCT routines with various output sample block sizes for
+ * direct resolution reduction or enlargement and for direct resolving the
+ * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
+ * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
+ *
+ * For N<8 we simply take the corresponding low-frequency coefficients of
+ * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
+ * to yield the downscaled outputs.
+ * This can be seen as direct low-pass downsampling from the DCT domain
+ * point of view rather than the usual spatial domain point of view,
+ * yielding significant computational savings and results at least
+ * as good as common bilinear (averaging) spatial downsampling.
+ *
+ * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
+ * lower frequencies and higher frequencies assumed to be zero.
+ * It turns out that the computational effort is similar to the 8x8 IDCT
+ * regarding the output size.
+ * Furthermore, the scaling and descaling is the same for all IDCT sizes.
+ *
+ * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
+ * since there would be too many additional constants to pre-calculate.
  */
 
 #define JPEG_INTERNALS
@@ -38,7 +61,7 @@
  */
 
 #if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
 #endif
 
 
@@ -151,7 +174,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 {
   INT32 tmp0, tmp1, tmp2, tmp3;
   INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  INT32 z1, z2, z3;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE * quantptr;
   int * wsptr;
@@ -177,14 +200,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
      * With typical images and quantization tables, half or more of the
      * column DCT calculations can be simplified this way.
      */
-    
+
     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 	inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
+
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
       wsptr[DCTSIZE*2] = dcval;
@@ -193,82 +216,84 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       wsptr[DCTSIZE*5] = dcval;
       wsptr[DCTSIZE*6] = dcval;
       wsptr[DCTSIZE*7] = dcval;
-      
+
       inptr++;			/* advance pointers to next column */
       quantptr++;
       wsptr++;
       continue;
     }
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
     
     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-    
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
 
-    tmp0 = (z2 + z3) << CONST_BITS;
-    tmp1 = (z2 - z3) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
+
     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
     
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
     
     inptr++;			/* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
-  
+
   /* Pass 2: process rows from work array, store into output array. */
   /* Note that we must descale the results by a factor of 8 == 2**3, */
   /* and also undo the PASS1_BITS scaling. */
@@ -283,14 +308,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
      * test takes more time than it's worth.  In that case this section
      * may be commented out.
      */
-    
+
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
 				  & RANGE_MASK];
-      
+
       outptr[0] = dcval;
       outptr[1] = dcval;
       outptr[2] = dcval;
@@ -304,86 +329,4809 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       continue;
     }
 #endif
-    
+
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
     
     z2 = (INT32) wsptr[2];
     z3 = (INT32) wsptr[6];
-    
+
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
-    tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
-    tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+
+    /* Add fudge factor here for final descale. */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
     
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
-    
+
     tmp0 = (INT32) wsptr[7];
     tmp1 = (INT32) wsptr[5];
     tmp2 = (INT32) wsptr[3];
     tmp3 = (INT32) wsptr[1];
-    
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
 			    & RANGE_MASK];
-    
+
     wsptr += DCTSIZE;		/* advance pointer to next row */
   }
 }
 
+#ifdef IDCT_SCALING_SUPPORTED
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x7 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/14).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[7*7];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp13 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
+    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp0 = z1 + z3;
+    z2 -= tmp0;
+    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
+    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
+    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp1 += tmp2;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
+    tmp0 += z2;
+    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
+
+    /* Final output stage */
+
+    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 7 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
+    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp0 = z1 + z3;
+    z2 -= tmp0;
+    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
+    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
+    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
+    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+
+    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
+    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
+    tmp0 = tmp1 - tmp2;
+    tmp1 += tmp2;
+    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp1 += tmp2;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
+    tmp0 += z2;
+    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 7;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x6 output block.
+ *
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/12).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[6*6];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*1] = (int) (tmp11 + tmp1);
+    wsptr[6*4] = (int) (tmp11 - tmp1);
+    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[4];
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = tmp0 - tmp10 - tmp10;
+    tmp10 = (INT32) wsptr[2];
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << CONST_BITS;
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 5x5 output block.
+ *
+ * Optimized algorithm with 5 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/10).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[5*5];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;
+
+    /* Odd part */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
+    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
+    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
+
+    /* Final output stage */
+
+    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 5 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 <<= CONST_BITS;
+    tmp0 = (INT32) wsptr[2];
+    tmp1 = (INT32) wsptr[4];
+    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;
+
+    /* Odd part */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
+    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
+    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 5;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 4x4 output block.
+ *
+ * Optimized algorithm with 3 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+ */
+
+GLOBAL(void)
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[4*4];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    
+    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
+    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+		       CONST_BITS-PASS1_BITS);
+    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+		       CONST_BITS-PASS1_BITS);
+
+    /* Final output stage */
+
+    wsptr[4*0] = (int) (tmp10 + tmp0);
+    wsptr[4*3] = (int) (tmp10 - tmp0);
+    wsptr[4*1] = (int) (tmp12 + tmp2);
+    wsptr[4*2] = (int) (tmp12 - tmp2);
+  }
+
+  /* Pass 2: process 4 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp2 = (INT32) wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 4;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x3 output block.
+ *
+ * Optimized algorithm with 2 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/6).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[3*3];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;
+
+    /* Odd part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage */
+
+    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 3 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[2];
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;
+
+    /* Odd part */
+
+    tmp12 = (INT32) wsptr[1];
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 3;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 2x2 output block.
+ *
+ * Multiplication-less algorithm.
+ */
+
+GLOBAL(void)
+jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  ISLOW_MULT_TYPE * quantptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input. */
+
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+  /* Column 0 */
+  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
+  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
+  /* Add fudge factor here for final descale. */
+  tmp4 += ONE << 2;
+
+  tmp0 = tmp4 + tmp5;
+  tmp2 = tmp4 - tmp5;
+
+  /* Column 1 */
+  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
+  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
+
+  tmp1 = tmp4 + tmp5;
+  tmp3 = tmp4 - tmp5;
+
+  /* Pass 2: process 2 rows, store into output array. */
+
+  /* Row 0 */
+  outptr = output_buf[0] + output_col;
+
+  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
+  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
+
+  /* Row 1 */
+  outptr = output_buf[1] + output_col;
+
+  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
+  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 1x1 output block.
+ *
+ * We hardly need an inverse DCT routine for this: just take the
+ * average pixel value, which is one-eighth of the DC coefficient.
+ */
+
+GLOBAL(void)
+jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  int dcval;
+  ISLOW_MULT_TYPE * quantptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* 1x1 is trivial: just take the DC coefficient divided by 8. */
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
+  dcval = (int) DESCALE((INT32) dcval, 3);
+
+  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 9x9 output block.
+ *
+ * Optimized algorithm with 10 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/18).
+ */
+
+GLOBAL(void)
+jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*9];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
+    tmp1 = tmp0 + tmp3;
+    tmp2 = tmp0 - tmp3 - tmp3;
+
+    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+    tmp11 = tmp2 + tmp0;
+    tmp14 = tmp2 - tmp0 - tmp0;
+
+    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
+    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
+
+    tmp10 = tmp1 + tmp0 - tmp3;
+    tmp12 = tmp1 - tmp0 + tmp2;
+    tmp13 = tmp1 - tmp2 + tmp3;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+
+    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
+    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
+    tmp0 = tmp2 + tmp3 - z2;
+    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
+    tmp2 += z2 - tmp1;
+    tmp3 += z2 + tmp1;
+    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 9 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 9; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
+    tmp1 = tmp0 + tmp3;
+    tmp2 = tmp0 - tmp3 - tmp3;
+
+    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
+    tmp11 = tmp2 + tmp0;
+    tmp14 = tmp2 - tmp0 - tmp0;
+
+    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
+    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
+    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
+
+    tmp10 = tmp1 + tmp0 - tmp3;
+    tmp12 = tmp1 - tmp0 + tmp2;
+    tmp13 = tmp1 - tmp2 + tmp3;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+
+    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
+    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
+    tmp0 = tmp2 + tmp3 - z2;
+    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
+    tmp2 += z2 - tmp1;
+    tmp3 += z2 + tmp1;
+    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x10 output block.
+ *
+ * Optimized algorithm with 12 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/20).
+ */
+
+GLOBAL(void)
+jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4, z5;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*10];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+    z5 = z3 << CONST_BITS;
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z5 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) (tmp22 + tmp12);
+    wsptr[8*7] = (int) (tmp22 - tmp12);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 10 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 10; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z3 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 11x11 output block.
+ *
+ * Optimized algorithm with 24 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/22).
+ */
+
+GLOBAL(void)
+jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*11];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
+    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
+    z4 = z1 + z3;
+    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    z4 -= z2;
+    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
+    tmp21 = tmp20 + tmp23 + tmp25 -
+	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+    tmp24 += tmp25;
+    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
+    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
+	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z2;
+    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
+    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
+    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
+    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    tmp11 += z1;
+    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
+    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 11 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 11; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
+    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
+    z4 = z1 + z3;
+    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    z4 -= z2;
+    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
+    tmp21 = tmp20 + tmp23 + tmp25 -
+	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
+    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
+    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
+    tmp24 += tmp25;
+    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
+    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
+	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
+    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z2;
+    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
+    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
+    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
+    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
+    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
+    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
+    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    tmp11 += z1;
+    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
+    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
+	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x12 output block.
+ *
+ * Optimized algorithm with 15 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/24).
+ */
+
+GLOBAL(void)
+jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*12];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 12 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 12; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+
+    z4 = (INT32) wsptr[4];
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = (INT32) wsptr[2];
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = (INT32) wsptr[6];
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 13x13 output block.
+ *
+ * Optimized algorithm with 29 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/26).
+ */
+
+GLOBAL(void)
+jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*13];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
+
+    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
+    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
+
+    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
+    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
+
+    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
+    tmp15 = z1 + z4;
+    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp11 += tmp14;
+    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp12 += tmp14;
+    tmp13 += tmp14;
+    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
+    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
+    tmp14 += z1;
+    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
+	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 13 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 13; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[4];
+    z4 = (INT32) wsptr[6];
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
+
+    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
+    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
+
+    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
+    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+
+    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
+    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
+
+    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+
+    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
+    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
+    tmp15 = z1 + z4;
+    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
+    tmp10 = tmp11 + tmp12 + tmp13 -
+	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
+    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
+    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
+    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp11 += tmp14;
+    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
+    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp12 += tmp14;
+    tmp13 += tmp14;
+    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
+    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
+	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
+    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
+    tmp14 += z1;
+    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
+	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x14 output block.
+ *
+ * Optimized algorithm with 20 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/28).
+ */
+
+GLOBAL(void)
+jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*14];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp13 = z4 << CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
+    tmp16 += tmp15;
+    z1    += z4;
+    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
+    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
+    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
+
+    tmp13 = (z1 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) (tmp23 + tmp13);
+    wsptr[8*10] = (int) (tmp23 - tmp13);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 14 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 14; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+    z4 <<= CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
+    tmp16 += tmp15;
+    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
+    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
+    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
+
+    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 15x15 output block.
+ *
+ * Optimized algorithm with 22 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/30).
+ */
+
+GLOBAL(void)
+jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*15];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+    tmp12 = z1 - tmp10;
+    tmp13 = z1 + tmp11;
+    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+
+    z4 = z2 - z3;
+    z3 += z2;
+    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
+
+    tmp20 = tmp13 + tmp10 + tmp11;
+    tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+    tmp25 = tmp13 - tmp10 - tmp11;
+    tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+    tmp21 = tmp12 + tmp10 + tmp11;
+    tmp24 = tmp13 - tmp10 + tmp11;
+    tmp11 += tmp11;
+    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
+    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp13 = z2 - z4;
+    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
+    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
+    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
+
+    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
+    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    z2 = z1 - z4;
+    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
+
+    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
+    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
+    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
+    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 15 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 15; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[4];
+    z4 = (INT32) wsptr[6];
+
+    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
+    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
+
+    tmp12 = z1 - tmp10;
+    tmp13 = z1 + tmp11;
+    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
+
+    z4 = z2 - z3;
+    z3 += z2;
+    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
+    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
+
+    tmp20 = tmp13 + tmp10 + tmp11;
+    tmp23 = tmp12 - tmp10 + tmp11 + z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
+
+    tmp25 = tmp13 - tmp10 - tmp11;
+    tmp26 = tmp12 + tmp10 - tmp11 - z2;
+
+    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
+    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
+
+    tmp21 = tmp12 + tmp10 + tmp11;
+    tmp24 = tmp13 - tmp10 + tmp11;
+    tmp11 += tmp11;
+    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
+    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z4 = (INT32) wsptr[5];
+    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
+    z4 = (INT32) wsptr[7];
+
+    tmp13 = z2 - z4;
+    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
+    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
+    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
+
+    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
+    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    z2 = z1 - z4;
+    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
+
+    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
+    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
+    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
+    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
+    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
+    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x16 output block.
+ *
+ * Optimized algorithm with 28 multiplications in the 1-D kernel.
+ * cK represents sqrt(2) * cos(K*pi/32).
+ */
+
+GLOBAL(void)
+jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*16];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 16 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 16; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[4];
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 16x8 output block.
+ *
+ * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*8];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = DCTSIZE; ctr > 0; ctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any column in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * column DCT calculations can be simplified this way.
+     */
+    
+    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+	inptr[DCTSIZE*7] == 0) {
+      /* AC terms all zero */
+      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+      
+      wsptr[DCTSIZE*0] = dcval;
+      wsptr[DCTSIZE*1] = dcval;
+      wsptr[DCTSIZE*2] = dcval;
+      wsptr[DCTSIZE*3] = dcval;
+      wsptr[DCTSIZE*4] = dcval;
+      wsptr[DCTSIZE*5] = dcval;
+      wsptr[DCTSIZE*6] = dcval;
+      wsptr[DCTSIZE*7] = dcval;
+      
+      inptr++;			/* advance pointers to next column */
+      quantptr++;
+      wsptr++;
+      continue;
+    }
+    
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+    
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+    
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+    
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+    
+    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    
+    inptr++;			/* advance pointers to next column */
+    quantptr++;
+    wsptr++;
+  }
+
+  /* Pass 2: process 8 rows from work array, store into output array.
+   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[4];
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 14x7 output block.
+ *
+ * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*7];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp23 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
+    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
+    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp10 = z1 + z3;
+    z2 -= tmp10;
+    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
+    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
+    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
+    tmp10 = tmp11 - tmp12;
+    tmp11 += tmp12;
+    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
+    tmp11 += tmp12;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
+    tmp10 += z2;
+    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 7 rows from work array, store into output array.
+   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[6];
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+    z4 <<= CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
+    tmp16 += tmp15;
+    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
+    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
+    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
+
+    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 12x6 output block.
+ *
+ * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*6];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
+    tmp11 = tmp10 + tmp20;
+    tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
+    tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
+    tmp20 = tmp11 + tmp10;
+    tmp22 = tmp11 - tmp10;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+    tmp11 = (z1 - z2 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) (tmp21 + tmp11);
+    wsptr[8*4] = (int) (tmp21 - tmp11);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array.
+   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+
+    z4 = (INT32) wsptr[4];
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = (INT32) wsptr[2];
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = (INT32) wsptr[6];
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage */
+
+    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
+					       CONST_BITS+PASS1_BITS+3)
+			     & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 10x5 output block.
+ *
+ * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*5];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;
+
+    /* Odd part */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
+    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
+    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 5 rows from work array, store into output array.
+   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[4];
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    z3 <<= CONST_BITS;
+    z4 = (INT32) wsptr[7];
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z3 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 8;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 8x4 output block.
+ *
+ * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*4];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
+    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
+		       CONST_BITS-PASS1_BITS);
+    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
+		       CONST_BITS-PASS1_BITS);
+
+    /* Final output stage */
+
+    wsptr[8*0] = (int) (tmp10 + tmp0);
+    wsptr[8*3] = (int) (tmp10 - tmp0);
+    wsptr[8*1] = (int) (tmp12 + tmp2);
+    wsptr[8*2] = (int) (tmp12 - tmp2);
+  }
+
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    /* Add fudge factor here for final descale. */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+    
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    tmp0 = (INT32) wsptr[7];
+    tmp1 = (INT32) wsptr[5];
+    tmp2 = (INT32) wsptr[3];
+    tmp3 = (INT32) wsptr[1];
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += DCTSIZE;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 6x3 output block.
+ *
+ * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[6*3];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;
+
+    /* Odd part */
+
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage */
+
+    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+  }
+  
+  /* Pass 2: process 3 rows from work array, store into output array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[4];
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = tmp0 - tmp10 - tmp10;
+    tmp10 = (INT32) wsptr[2];
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << CONST_BITS;
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x2 output block.
+ *
+ * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  INT32 * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  INT32 workspace[4*2];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+
+    /* Odd part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+    /* Final output stage */
+
+    wsptr[4*0] = tmp10 + tmp0;
+    wsptr[4*1] = tmp10 - tmp0;
+  }
+
+  /* Pass 2: process 2 rows from work array, store into output array.
+   * 4-point IDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 2; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = wsptr[0] + (ONE << 2);
+    tmp2 = wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = wsptr[1];
+    z3 = wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 4;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x1 output block.
+ *
+ * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp10;
+  ISLOW_MULT_TYPE * quantptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* Pass 1: empty. */
+
+  /* Pass 2: process 1 row from input, store into output array. */
+
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  outptr = output_buf[0] + output_col;
+
+  /* Even part */
+
+  tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
+  /* Add fudge factor here for final descale. */
+  tmp10 += ONE << 2;
+
+  /* Odd part */
+
+  tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
+
+  /* Final output stage */
+
+  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
+  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 8x16 output block.
+ *
+ * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[8*16];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
+    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
+
+    tmp10 = tmp0 + tmp1;
+    tmp11 = tmp0 - tmp1;
+    tmp12 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z3 = z1 - z2;
+    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
+    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
+
+    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
+    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
+    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
+    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
+
+    tmp20 = tmp10 + tmp0;
+    tmp27 = tmp10 - tmp0;
+    tmp21 = tmp12 + tmp1;
+    tmp26 = tmp12 - tmp1;
+    tmp22 = tmp13 + tmp2;
+    tmp25 = tmp13 - tmp2;
+    tmp23 = tmp11 + tmp3;
+    tmp24 = tmp11 - tmp3;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z1 + z3;
+
+    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
+    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
+    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
+    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
+    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
+    tmp0  = tmp1 + tmp2 + tmp3 -
+	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
+    tmp13 = tmp10 + tmp11 + tmp12 -
+	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
+    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
+    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
+    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
+    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
+    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
+    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
+    z2    += z4;
+    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    tmp1  += z1;
+    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
+    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
+    tmp12 += z2;
+    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    tmp2  += z2;
+    tmp3  += z2;
+    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
+    tmp10 += z2;
+    tmp11 += z2;
+
+    /* Final output stage */
+
+    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
+    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
+    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
+    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
+    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+  }
+  
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 16; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+    
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    
+    z2 = (INT32) wsptr[2];
+    z3 = (INT32) wsptr[6];
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    /* Add fudge factor here for final descale. */
+    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (INT32) wsptr[4];
+    
+    tmp0 = (z2 + z3) << CONST_BITS;
+    tmp1 = (z2 - z3) << CONST_BITS;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+    
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+    
+    tmp0 = (INT32) wsptr[7];
+    tmp1 = (INT32) wsptr[5];
+    tmp2 = (INT32) wsptr[3];
+    tmp3 = (INT32) wsptr[1];
+    
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+    
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+    
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    
+    wsptr += DCTSIZE;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 7x14 output block.
+ *
+ * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[7*14];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
+    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
+    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
+
+    tmp10 = z1 + z2;
+    tmp11 = z1 + z3;
+    tmp12 = z1 - z4;
+
+    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
+
+    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
+    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
+    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
+	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
+
+    tmp20 = tmp10 + tmp13;
+    tmp26 = tmp10 - tmp13;
+    tmp21 = tmp11 + tmp14;
+    tmp25 = tmp11 - tmp14;
+    tmp22 = tmp12 + tmp15;
+    tmp24 = tmp12 - tmp15;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp13 = z4 << CONST_BITS;
+
+    tmp14 = z1 + z3;
+    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
+    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
+    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
+    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
+    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
+    z1    -= z2;
+    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
+    tmp16 += tmp15;
+    z1    += z4;
+    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
+    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
+    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
+    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
+    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
+
+    tmp13 = (z1 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[7*3]  = (int) (tmp23 + tmp13);
+    wsptr[7*10] = (int) (tmp23 - tmp13);
+    wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 14 rows from work array, store into output array.
+   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 14; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp23 <<= CONST_BITS;
+
+    z1 = (INT32) wsptr[2];
+    z2 = (INT32) wsptr[4];
+    z3 = (INT32) wsptr[6];
+
+    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
+    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
+    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
+    tmp10 = z1 + z3;
+    z2 -= tmp10;
+    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
+    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
+    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
+    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+
+    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
+    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
+    tmp10 = tmp11 - tmp12;
+    tmp11 += tmp12;
+    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
+    tmp11 += tmp12;
+    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
+    tmp10 += z2;
+    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 7;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 6x12 output block.
+ *
+ * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
+  INT32 z1, z2, z3, z4;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[6*12];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
+
+    tmp10 = z3 + z4;
+    tmp11 = z3 - z4;
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
+    z1 <<= CONST_BITS;
+    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 <<= CONST_BITS;
+
+    tmp12 = z1 - z2;
+
+    tmp21 = z3 + tmp12;
+    tmp24 = z3 - tmp12;
+
+    tmp12 = z4 + z2;
+
+    tmp20 = tmp10 + tmp12;
+    tmp25 = tmp10 - tmp12;
+
+    tmp12 = z4 - z1 - z2;
+
+    tmp22 = tmp11 + tmp12;
+    tmp23 = tmp11 - tmp12;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
+    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+
+    tmp10 = z1 + z3;
+    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
+    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
+    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
+    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
+    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
+    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
+	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
+
+    z1 -= z4;
+    z2 -= z3;
+    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
+    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
+    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
+
+    /* Final output stage */
+
+    wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 12 rows from work array, store into output array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 12; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 <<= CONST_BITS;
+    tmp12 = (INT32) wsptr[4];
+    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
+    tmp11 = tmp10 + tmp20;
+    tmp21 = tmp10 - tmp20 - tmp20;
+    tmp20 = (INT32) wsptr[2];
+    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
+    tmp20 = tmp11 + tmp10;
+    tmp22 = tmp11 - tmp10;
+
+    /* Odd part */
+
+    z1 = (INT32) wsptr[1];
+    z2 = (INT32) wsptr[3];
+    z3 = (INT32) wsptr[5];
+    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
+    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
+    tmp11 = (z1 - z2 - z3) << CONST_BITS;
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 6;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 5x10 output block.
+ *
+ * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
+  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
+  INT32 z1, z2, z3, z4, z5;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[5*10];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
+    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z2;
+
+    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
+			CONST_BITS-PASS1_BITS);
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
+    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
+    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
+
+    tmp20 = tmp10 + tmp12;
+    tmp24 = tmp10 - tmp12;
+    tmp21 = tmp11 + tmp13;
+    tmp23 = tmp11 - tmp13;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    tmp11 = z2 + z4;
+    tmp13 = z2 - z4;
+
+    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
+    z5 = z3 << CONST_BITS;
+
+    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
+    z4 = z5 + tmp12;
+
+    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
+    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
+
+    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
+    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
+
+    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
+
+    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
+    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
+
+    /* Final output stage */
+
+    wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
+    wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
+    wsptr[5*2] = (int) (tmp22 + tmp12);
+    wsptr[5*7] = (int) (tmp22 - tmp12);
+    wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 10 rows from work array, store into output array.
+   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 10; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 <<= CONST_BITS;
+    tmp13 = (INT32) wsptr[2];
+    tmp14 = (INT32) wsptr[4];
+    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
+    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
+    z3 = tmp12 + z2;
+    tmp10 = z3 + z1;
+    tmp11 = z3 - z1;
+    tmp12 -= z2 << 2;
+
+    /* Odd part */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
+    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
+    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 5;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 4x8 output block.
+ *
+ * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[4*8];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 4; ctr > 0; ctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any column in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * column DCT calculations can be simplified this way.
+     */
+
+    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+	inptr[DCTSIZE*7] == 0) {
+      /* AC terms all zero */
+      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+
+      wsptr[4*0] = dcval;
+      wsptr[4*1] = dcval;
+      wsptr[4*2] = dcval;
+      wsptr[4*3] = dcval;
+      wsptr[4*4] = dcval;
+      wsptr[4*5] = dcval;
+      wsptr[4*6] = dcval;
+      wsptr[4*7] = dcval;
+
+      inptr++;			/* advance pointers to next column */
+      quantptr++;
+      wsptr++;
+      continue;
+    }
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
+    
+    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 <<= CONST_BITS;
+    z3 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
+
+    tmp0 = z2 + z3;
+    tmp1 = z2 - z3;
+    
+    tmp10 = tmp0 + tmp2;
+    tmp13 = tmp0 - tmp2;
+    tmp11 = tmp1 + tmp3;
+    tmp12 = tmp1 - tmp3;
+
+    /* Odd part per figure 8; the matrix is unitary and hence its
+     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+     */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+    z2 = tmp0 + tmp2;
+    z3 = tmp1 + tmp3;
+
+    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
+    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z2 += z1;
+    z3 += z1;
+
+    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    tmp0 += z1 + z2;
+    tmp3 += z1 + z3;
+
+    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp1 += z1 + z3;
+    tmp2 += z1 + z2;
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+    wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+    wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+
+    inptr++;			/* advance pointers to next column */
+    quantptr++;
+    wsptr++;
+  }
+
+  /* Pass 2: process 8 rows from work array, store into output array.
+   * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 8; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp2 = (INT32) wsptr[2];
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = (INT32) wsptr[1];
+    z3 = (INT32) wsptr[3];
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    
+    wsptr += 4;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a reduced-size 3x6 output block.
+ *
+ * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  int * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  int workspace[3*6];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 <<= CONST_BITS;
+    /* Add fudge factor here for final descale. */
+    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
+    tmp1 = tmp0 + tmp10;
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
+    tmp10 = tmp1 + tmp0;
+    tmp12 = tmp1 - tmp0;
+
+    /* Odd part */
+
+    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
+    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
+    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
+    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
+
+    /* Final output stage */
+
+    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[3*1] = (int) (tmp11 + tmp1);
+    wsptr[3*4] = (int) (tmp11 - tmp1);
+    wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+  }
+
+  /* Pass 2: process 6 rows from work array, store into output array.
+   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
+   */
+  wsptr = workspace;
+  for (ctr = 0; ctr < 6; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 <<= CONST_BITS;
+    tmp2 = (INT32) wsptr[2];
+    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
+    tmp10 = tmp0 + tmp12;
+    tmp2 = tmp0 - tmp12 - tmp12;
+
+    /* Odd part */
+
+    tmp12 = (INT32) wsptr[1];
+    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
+					      CONST_BITS+PASS1_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 3;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 2x4 output block.
+ *
+ * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp2, tmp10, tmp12;
+  INT32 z1, z2, z3;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE * quantptr;
+  INT32 * wsptr;
+  JSAMPROW outptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  int ctr;
+  INT32 workspace[2*4];	/* buffers data between passes */
+  SHIFT_TEMPS
+
+  /* Pass 1: process columns from input, store into work array.
+   * 4-point IDCT kernel,
+   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
+   */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  wsptr = workspace;
+  for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
+    /* Even part */
+
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+
+    tmp10 = (tmp0 + tmp2) << CONST_BITS;
+    tmp12 = (tmp0 - tmp2) << CONST_BITS;
+
+    /* Odd part */
+    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
+
+    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+
+    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
+    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
+    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
+
+    /* Final output stage */
+
+    wsptr[2*0] = tmp10 + tmp0;
+    wsptr[2*3] = tmp10 - tmp0;
+    wsptr[2*1] = tmp12 + tmp2;
+    wsptr[2*2] = tmp12 - tmp2;
+  }
+
+  /* Pass 2: process 4 rows from work array, store into output array. */
+
+  wsptr = workspace;
+  for (ctr = 0; ctr < 4; ctr++) {
+    outptr = output_buf[ctr] + output_col;
+
+    /* Even part */
+
+    /* Add fudge factor here for final descale. */
+    tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
+
+    /* Odd part */
+
+    tmp0 = wsptr[1];
+
+    /* Final output stage */
+
+    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
+			    & RANGE_MASK];
+    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
+			    & RANGE_MASK];
+
+    wsptr += 2;		/* advance pointer to next row */
+  }
+}
+
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients,
+ * producing a 1x2 output block.
+ *
+ * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
+ */
+
+GLOBAL(void)
+jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JCOEFPTR coef_block,
+	       JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  INT32 tmp0, tmp10;
+  ISLOW_MULT_TYPE * quantptr;
+  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  SHIFT_TEMPS
+
+  /* Process 1 column from input, store into output array. */
+
+  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+
+  /* Even part */
+    
+  tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
+  /* Add fudge factor here for final descale. */
+  tmp10 += ONE << 2;
+
+  /* Odd part */
+
+  tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
+
+  /* Final output stage */
+
+  output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
+					  & RANGE_MASK];
+  output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
+					  & RANGE_MASK];
+}
+
+#endif /* IDCT_SCALING_SUPPORTED */
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/3rdparty/libjpeg/jidctred.c b/3rdparty/libjpeg/jidctred.c
deleted file mode 100644
index 421f3c7ca1..0000000000
--- a/3rdparty/libjpeg/jidctred.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * jidctred.c
- *
- * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains inverse-DCT routines that produce reduced-size output:
- * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
- *
- * The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M)
- * algorithm used in jidctint.c.  We simply replace each 8-to-8 1-D IDCT step
- * with an 8-to-4 step that produces the four averages of two adjacent outputs
- * (or an 8-to-2 step producing two averages of four outputs, for 2x2 output).
- * These steps were derived by computing the corresponding values at the end
- * of the normal LL&M code, then simplifying as much as possible.
- *
- * 1x1 is trivial: just take the DC coefficient divided by 8.
- *
- * See jidctint.c for additional comments.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
-
-#ifdef IDCT_SCALING_SUPPORTED
-
-
-/*
- * This module is specialized to the case DCTSIZE = 8.
- */
-
-#if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
-#endif
-
-
-/* Scaling is the same as in jidctint.c. */
-
-#if BITS_IN_JSAMPLE == 8
-#define CONST_BITS  13
-#define PASS1_BITS  2
-#else
-#define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
-#endif
-
-/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
- * causing a lot of useless floating-point operations at run time.
- * To get around this we use the following pre-calculated constants.
- * If you change CONST_BITS you may want to add appropriate values.
- * (With a reasonable C compiler, you can just rely on the FIX() macro...)
- */
-
-#if CONST_BITS == 13
-#define FIX_0_211164243  ((INT32)  1730)	/* FIX(0.211164243) */
-#define FIX_0_509795579  ((INT32)  4176)	/* FIX(0.509795579) */
-#define FIX_0_601344887  ((INT32)  4926)	/* FIX(0.601344887) */
-#define FIX_0_720959822  ((INT32)  5906)	/* FIX(0.720959822) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_850430095  ((INT32)  6967)	/* FIX(0.850430095) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_061594337  ((INT32)  8697)	/* FIX(1.061594337) */
-#define FIX_1_272758580  ((INT32)  10426)	/* FIX(1.272758580) */
-#define FIX_1_451774981  ((INT32)  11893)	/* FIX(1.451774981) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_2_172734803  ((INT32)  17799)	/* FIX(2.172734803) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_624509785  ((INT32)  29692)	/* FIX(3.624509785) */
-#else
-#define FIX_0_211164243  FIX(0.211164243)
-#define FIX_0_509795579  FIX(0.509795579)
-#define FIX_0_601344887  FIX(0.601344887)
-#define FIX_0_720959822  FIX(0.720959822)
-#define FIX_0_765366865  FIX(0.765366865)
-#define FIX_0_850430095  FIX(0.850430095)
-#define FIX_0_899976223  FIX(0.899976223)
-#define FIX_1_061594337  FIX(1.061594337)
-#define FIX_1_272758580  FIX(1.272758580)
-#define FIX_1_451774981  FIX(1.451774981)
-#define FIX_1_847759065  FIX(1.847759065)
-#define FIX_2_172734803  FIX(2.172734803)
-#define FIX_2_562915447  FIX(2.562915447)
-#define FIX_3_624509785  FIX(3.624509785)
-#endif
-
-
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
- * For 8-bit samples with the recommended scaling, all the variable
- * and constant values involved are no more than 16 bits wide, so a
- * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
- * For 12-bit samples, a full 32-bit multiplication will be needed.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
-#else
-#define MULTIPLY(var,const)  ((var) * (const))
-#endif
-
-
-/* Dequantize a coefficient by multiplying it by the multiplier-table
- * entry; produce an int result.  In this module, both inputs and result
- * are 16 bits or less, so either int or short multiply will work.
- */
-
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 4x4 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp2, tmp10, tmp12;
-  INT32 z1, z2, z3, z4;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*4];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process column 4, because second pass won't use it */
-    if (ctr == DCTSIZE-4)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-	inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      
-      continue;
-    }
-    
-    /* Even part */
-    
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= (CONST_BITS+1);
-    
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
-    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
-    
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-    
-    /* Odd part */
-    
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
-  }
-  
-  /* Pass 2: process 4 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 4; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-    
-    tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1);
-    
-    tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
-	 + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865);
-    
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-    
-    /* Odd part */
-    
-    z1 = (INT32) wsptr[7];
-    z2 = (INT32) wsptr[5];
-    z3 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[1];
-    
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-	 + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-	 + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-	 + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-    
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-	 + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-	 + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-	 + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+1)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 2x2 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp10, z1;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*2];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process columns 2,4,6 */
-    if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-      
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      
-      continue;
-    }
-    
-    /* Even part */
-    
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = z1 << (CONST_BITS+2);
-    
-    /* Odd part */
-
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
-  }
-  
-  /* Pass 2: process 2 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 2; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part */
-    
-    tmp10 = ((INT32) wsptr[0]) << (CONST_BITS+2);
-    
-    /* Odd part */
-
-    tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-	 + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-	 + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-	 + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-					  CONST_BITS+PASS1_BITS+3+2)
-			    & RANGE_MASK];
-    
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 1x1 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JCOEFPTR coef_block,
-	       JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  int dcval;
-  ISLOW_MULT_TYPE * quantptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  SHIFT_TEMPS
-
-  /* We hardly need an inverse DCT routine for this: just take the
-   * average pixel value, which is one-eighth of the DC coefficient.
-   */
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((INT32) dcval, 3);
-
-  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
-}
-
-#endif /* IDCT_SCALING_SUPPORTED */
diff --git a/3rdparty/libjpeg/jmemmgr.c b/3rdparty/libjpeg/jmemmgr.c
index 31050bf590..f0e83fb950 100644
--- a/3rdparty/libjpeg/jmemmgr.c
+++ b/3rdparty/libjpeg/jmemmgr.c
@@ -2,6 +2,7 @@
  * jmemmgr.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -36,9 +37,6 @@ extern char * getenv JPP((const char * name));
 #endif
 #endif
 
-#if defined _MSC_VER && _MSC_VER >= 1400
-#pragma warning(disable: 4267)
-#endif
 
 /*
  * Some important notes:
@@ -824,7 +822,7 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-	jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
+	FMEMZERO((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
 	undef_row++;
       }
     } else {
@@ -909,7 +907,7 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-	jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
+	FMEMZERO((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
 	undef_row++;
       }
     } else {
diff --git a/3rdparty/libjpeg/jmemnobs.c b/3rdparty/libjpeg/jmemnobs.c
new file mode 100644
index 0000000000..eb8c337725
--- /dev/null
+++ b/3rdparty/libjpeg/jmemnobs.c
@@ -0,0 +1,109 @@
+/*
+ * jmemnobs.c
+ *
+ * Copyright (C) 1992-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file provides a really simple implementation of the system-
+ * dependent portion of the JPEG memory manager.  This implementation
+ * assumes that no backing-store files are needed: all required space
+ * can be obtained from malloc().
+ * This is very portable in the sense that it'll compile on almost anything,
+ * but you'd better have lots of main memory (or virtual memory) if you want
+ * to process big images.
+ * Note that the max_memory_to_use option is ignored by this implementation.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jmemsys.h"		/* import the system-dependent declarations */
+
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
+
+/*
+ * Memory allocation and freeing are controlled by the regular library
+ * routines malloc() and free().
+ */
+
+GLOBAL(void *)
+jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void *) malloc(sizeofobject);
+}
+
+GLOBAL(void)
+jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
+{
+  free(object);
+}
+
+
+/*
+ * "Large" objects are treated the same as "small" ones.
+ * NB: although we include FAR keywords in the routine declarations,
+ * this file won't actually work in 80x86 small/medium model; at least,
+ * you probably won't be able to process useful-size images in only 64KB.
+ */
+
+GLOBAL(void FAR *)
+jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void FAR *) malloc(sizeofobject);
+}
+
+GLOBAL(void)
+jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
+{
+  free(object);
+}
+
+
+/*
+ * This routine computes the total memory space available for allocation.
+ * Here we always say, "we got all you want bud!"
+ */
+
+GLOBAL(long)
+jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
+		    long max_bytes_needed, long already_allocated)
+{
+  return max_bytes_needed;
+}
+
+
+/*
+ * Backing store (temporary file) management.
+ * Since jpeg_mem_available always promised the moon,
+ * this should never be called and we can just error out.
+ */
+
+GLOBAL(void)
+jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
+			 long total_bytes_needed)
+{
+  ERREXIT(cinfo, JERR_NO_BACKING_STORE);
+}
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required.  Here, there isn't any.
+ */
+
+GLOBAL(long)
+jpeg_mem_init (j_common_ptr cinfo)
+{
+  return 0;			/* just set max_memory_to_use to 0 */
+}
+
+GLOBAL(void)
+jpeg_mem_term (j_common_ptr cinfo)
+{
+  /* no work */
+}
diff --git a/3rdparty/libjpeg/jmorecfg.h b/3rdparty/libjpeg/jmorecfg.h
index 54a7d1c447..6c085c36a6 100644
--- a/3rdparty/libjpeg/jmorecfg.h
+++ b/3rdparty/libjpeg/jmorecfg.h
@@ -2,6 +2,7 @@
  * jmorecfg.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -158,8 +159,14 @@ typedef short INT16;
 /* INT32 must hold at least signed 32-bit values. */
 
 #ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+#ifndef _BASETSD_H_		/* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H		/* MinGW is slightly different */
+#ifndef QGLOBAL_H		/* Qt defines it in qglobal.h */
 typedef long INT32;
 #endif
+#endif
+#endif
+#endif
 
 /* Datatype used for image dimensions.  The JPEG standard only supports
  * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
@@ -209,11 +216,13 @@ typedef unsigned int JDIMENSION;
  * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
  */
 
+#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
+#endif
 
 
 /*
@@ -256,8 +265,6 @@ typedef int boolean;
  * (You may HAVE to do that if your compiler doesn't like null source files.)
  */
 
-/* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
-
 /* Capability options common to encoder and decoder: */
 
 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
@@ -266,9 +273,10 @@ typedef int boolean;
 
 /* Encoder capability options: */
 
-#undef  C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define DCT_SCALING_SUPPORTED	    /* Input rescaling via DCT? (Requires DCT_ISLOW)*/
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
 /* Note: if you selected 12-bit data precision, it is dangerous to turn off
  * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
@@ -282,12 +290,12 @@ typedef int boolean;
 
 /* Decoder capability options: */
 
-#undef  D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
 #define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
@@ -304,9 +312,7 @@ typedef int boolean;
  * the offsets will also change the order in which colormap data is organized.
  * RESTRICTIONS:
  * 1. The sample applications cjpeg,djpeg do NOT support modified RGB formats.
- * 2. These macros only affect RGB<=>YCbCr color conversion, so they are not
- *    useful if you are using JPEG color spaces other than YCbCr or grayscale.
- * 3. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
+ * 2. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
  *    is not 3 (they don't understand about dummy color components!).  So you
  *    can't use color quantization if you change that value.
  */
diff --git a/3rdparty/libjpeg/jpegint.h b/3rdparty/libjpeg/jpegint.h
index 95b00d405c..c0d5c14202 100644
--- a/3rdparty/libjpeg/jpegint.h
+++ b/3rdparty/libjpeg/jpegint.h
@@ -2,6 +2,7 @@
  * jpegint.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -99,14 +100,16 @@ struct jpeg_downsampler {
 };
 
 /* Forward DCT (also controls coefficient quantization) */
+typedef JMETHOD(void, forward_DCT_ptr,
+		(j_compress_ptr cinfo, jpeg_component_info * compptr,
+		 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+		 JDIMENSION start_row, JDIMENSION start_col,
+		 JDIMENSION num_blocks));
+
 struct jpeg_forward_dct {
   JMETHOD(void, start_pass, (j_compress_ptr cinfo));
-  /* perhaps this should be an array??? */
-  JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
-			      jpeg_component_info * compptr,
-			      JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-			      JDIMENSION start_row, JDIMENSION start_col,
-			      JDIMENSION num_blocks));
+  /* It is useful to allow each component to have a separate FDCT method. */
+  forward_DCT_ptr forward_DCT[MAX_COMPONENTS];
 };
 
 /* Entropy encoding */
@@ -210,10 +213,6 @@ struct jpeg_entropy_decoder {
   JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
   JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo,
 				JBLOCKROW *MCU_data));
-
-  /* This is here to share code between baseline and progressive decoders; */
-  /* other modules probably should not use it */
-  boolean insufficient_data;	/* set TRUE after emitting warning */
 };
 
 /* Inverse DCT (also performs dequantization) */
@@ -303,7 +302,7 @@ struct jpeg_color_quantizer {
 #define jinit_downsampler	jIDownsampler
 #define jinit_forward_dct	jIFDCT
 #define jinit_huff_encoder	jIHEncoder
-#define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
 #define jinit_marker_writer	jIMWriter
 #define jinit_master_decompress	jIDMaster
 #define jinit_d_main_controller	jIDMainC
@@ -312,7 +311,7 @@ struct jpeg_color_quantizer {
 #define jinit_input_controller	jIInCtlr
 #define jinit_marker_reader	jIMReader
 #define jinit_huff_decoder	jIHDecoder
-#define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
 #define jinit_inverse_dct	jIIDCT
 #define jinit_upsampler		jIUpsampler
 #define jinit_color_deconverter	jIDColor
@@ -322,14 +321,41 @@ struct jpeg_color_quantizer {
 #define jinit_memory_mgr	jIMemMgr
 #define jdiv_round_up		jDivRound
 #define jround_up		jRound
+#define jzero_far		jZeroFar
 #define jcopy_sample_rows	jCopySamples
 #define jcopy_block_row		jCopyBlocks
-#define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_natural_order7	jZAG7Table
+#define jpeg_natural_order6	jZAG6Table
+#define jpeg_natural_order5	jZAG5Table
+#define jpeg_natural_order4	jZAG4Table
+#define jpeg_natural_order3	jZAG3Table
+#define jpeg_natural_order2	jZAG2Table
+#define jpeg_aritab		jAriTab
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 
+/* On normal machines we can apply MEMCOPY() and MEMZERO() to sample arrays
+ * and coefficient-block arrays.  This won't work on 80x86 because the arrays
+ * are FAR and we're assuming a small-pointer memory model.  However, some
+ * DOS compilers provide far-pointer versions of memcpy() and memset() even
+ * in the small-model libraries.  These will be used if USE_FMEM is defined.
+ * Otherwise, the routines in jutils.c do it the hard way.
+ */
+
+#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macro */
+#define FMEMZERO(target,size)	MEMZERO(target,size)
+#else				/* 80x86 case */
+#ifdef USE_FMEM
+#define FMEMZERO(target,size)	_fmemset((void FAR *)(target), 0, (size_t)(size))
+#else
+EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
+#define FMEMZERO(target,size)	jzero_far(target, size)
+#endif
+#endif
+
+
 /* Compression module initialization routines */
 EXTERN(void) jinit_compress_master JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_c_master_control JPP((j_compress_ptr cinfo,
@@ -344,7 +370,7 @@ EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@@ -357,7 +383,7 @@ EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
 EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
@@ -375,12 +401,20 @@ EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
 				    int num_rows, JDIMENSION num_cols));
 EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row,
 				  JDIMENSION num_blocks));
-EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
 /* Constant tables in jutils.c */
 #if 0				/* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
+extern const int jpeg_natural_order7[]; /* zz to natural order for 7x7 block */
+extern const int jpeg_natural_order6[]; /* zz to natural order for 6x6 block */
+extern const int jpeg_natural_order5[]; /* zz to natural order for 5x5 block */
+extern const int jpeg_natural_order4[]; /* zz to natural order for 4x4 block */
+extern const int jpeg_natural_order3[]; /* zz to natural order for 3x3 block */
+extern const int jpeg_natural_order2[]; /* zz to natural order for 2x2 block */
+
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];
 
 /* Suppress undefined-structure complaints if necessary. */
 
diff --git a/3rdparty/libjpeg/jpeglib.h b/3rdparty/libjpeg/jpeglib.h
index b12cfd55e7..1327cffa96 100644
--- a/3rdparty/libjpeg/jpeglib.h
+++ b/3rdparty/libjpeg/jpeglib.h
@@ -2,6 +2,7 @@
  * jpeglib.h
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -13,17 +14,6 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* HJH modification: protect compiler options for structure alignment and enum
- * size if the compiler is Borland C++ */
-#ifdef __BORLANDC__
-#pragma option push -b
-#pragma option push -a4
-#endif
-
 /*
  * First we include the configuration files that record how this
  * installation of the JPEG library is set up.  jconfig.h can be
@@ -37,11 +27,19 @@ extern "C" {
 #include "jmorecfg.h"		/* seldom changed options */
 
 
-/* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
+/* Version IDs for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 80".
  */
 
-#define JPEG_LIB_VERSION  62	/* Version 6b */
+#define JPEG_LIB_VERSION        80	/* Compatibility version 8.0 */
+#define JPEG_LIB_VERSION_MAJOR  8
+#define JPEG_LIB_VERSION_MINOR  4
 
 
 /* Various constants determining the sizes of things.
@@ -49,7 +47,7 @@ extern "C" {
  * if you want to be compatible.
  */
 
-#define DCTSIZE		    8	/* The basic DCT block is 8x8 samples */
+#define DCTSIZE		    8	/* The basic DCT block is 8x8 coefficients */
 #define DCTSIZE2	    64	/* DCTSIZE squared; # of elements in a block */
 #define NUM_QUANT_TBLS      4	/* Quantization tables are numbered 0..3 */
 #define NUM_HUFF_TBLS       4	/* Huffman tables are numbered 0..3 */
@@ -149,18 +147,18 @@ typedef struct {
    */
   JDIMENSION width_in_blocks;
   JDIMENSION height_in_blocks;
-  /* Size of a DCT block in samples.  Always DCTSIZE for compression.
-   * For decompression this is the size of the output from one DCT block,
-   * reflecting any scaling we choose to apply during the IDCT step.
-   * Values of 1,2,4,8 are likely to be supported.  Note that different
-   * components may receive different IDCT scalings.
+  /* Size of a DCT block in samples,
+   * reflecting any scaling we choose to apply during the DCT step.
+   * Values from 1 to 16 are supported.
+   * Note that different components may receive different DCT scalings.
    */
-  int DCT_scaled_size;
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
   /* The downsampled dimensions are the component's actual, unpadded number
-   * of samples at the main buffer (preprocessing/compression interface), thus
-   * downsampled_width = ceil(image_width * Hi/Hmax)
-   * and similarly for height.  For decompression, IDCT scaling is included, so
-   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+   * of samples at the main buffer (preprocessing/compression interface);
+   * DCT scaling is included, so
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE)
+   * and similarly for height.
    */
   JDIMENSION downsampled_width;	 /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
@@ -175,7 +173,7 @@ typedef struct {
   int MCU_width;		/* number of blocks per MCU, horizontally */
   int MCU_height;		/* number of blocks per MCU, vertically */
   int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_scaled_size */
+  int MCU_sample_width;	/* MCU width in samples: MCU_width * DCT_h_scaled_size */
   int last_col_width;		/* # of non-dummy blocks across in last MCU */
   int last_row_height;		/* # of non-dummy blocks down in last MCU */
 
@@ -302,6 +300,17 @@ struct jpeg_compress_struct {
    * helper routines to simplify changing parameters.
    */
 
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;	/* scaled JPEG image width */
+  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+
   int data_precision;		/* bits of precision in image data */
 
   int num_components;		/* # of color components in JPEG image */
@@ -309,14 +318,17 @@ struct jpeg_compress_struct {
 
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
-  
+
   JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
-  /* ptrs to coefficient quantization tables, or NULL if not defined */
-  
+  int q_scale_factor[NUM_QUANT_TBLS];
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */
+
   JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
   JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
   /* ptrs to Huffman coding tables, or NULL if not defined */
-  
+
   UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
   UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
   UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@@ -332,6 +344,7 @@ struct jpeg_compress_struct {
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
   boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
   boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
   int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
   J_DCT_METHOD dct_method;	/* DCT algorithm selector */
 
@@ -375,6 +388,9 @@ struct jpeg_compress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
   /* The coefficient controller receives data in units of MCU rows as defined
    * for fully interleaved scans (whether the JPEG file is interleaved or not).
@@ -400,6 +416,10 @@ struct jpeg_compress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order;	/* natural-order position array */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+
   /*
    * Links to compression subobjects (methods and private variables of modules)
    */
@@ -546,6 +566,7 @@ struct jpeg_decompress_struct {
   jpeg_component_info * comp_info;
   /* comp_info[i] describes component that appears i'th in SOF */
 
+  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
   boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
   boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
 
@@ -586,7 +607,8 @@ struct jpeg_decompress_struct {
   int max_h_samp_factor;	/* largest h_samp_factor */
   int max_v_samp_factor;	/* largest v_samp_factor */
 
-  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
 
   JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
   /* The coefficient controller's input and output progress is measured in
@@ -594,7 +616,7 @@ struct jpeg_decompress_struct {
    * in fully interleaved JPEG scans, but are used whether the scan is
    * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
    * rows of each component.  Therefore, the IDCT output contains
-   * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+   * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row.
    */
 
   JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@@ -618,6 +640,12 @@ struct jpeg_decompress_struct {
 
   int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */
 
+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+
   /* This field is shared between entropy decoder and marker parser.
    * It is either zero or the code of a JPEG marker that has been
    * read from the data source, but has not yet been processed.
@@ -847,11 +875,14 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_destroy_decompress	jDestDecompress
 #define jpeg_stdio_dest		jStdDest
 #define jpeg_stdio_src		jStdSrc
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
 #define jpeg_set_defaults	jSetDefaults
 #define jpeg_set_colorspace	jSetColorspace
 #define jpeg_default_colorspace	jDefColorspace
 #define jpeg_set_quality	jSetQuality
 #define jpeg_set_linear_quality	jSetLQuality
+#define jpeg_default_qtables	jDefQTables
 #define jpeg_add_quant_table	jAddQuantTable
 #define jpeg_quality_scaling	jQualityScaling
 #define jpeg_simple_progression	jSimProgress
@@ -861,6 +892,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_start_compress	jStrtCompress
 #define jpeg_write_scanlines	jWrtScanlines
 #define jpeg_finish_compress	jFinCompress
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
 #define jpeg_write_raw_data	jWrtRawData
 #define jpeg_write_marker	jWrtMarker
 #define jpeg_write_m_header	jWrtMHeader
@@ -877,6 +909,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_input_complete	jInComplete
 #define jpeg_new_colormap	jNewCMap
 #define jpeg_consume_input	jConsumeInput
+#define jpeg_core_output_dimensions	jCoreDimensions
 #define jpeg_calc_output_dimensions	jCalcDimensions
 #define jpeg_save_markers	jSaveMarkers
 #define jpeg_set_marker_processor	jSetMarker
@@ -921,6 +954,14 @@ EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
 EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));
 
+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+			       unsigned char ** outbuffer,
+			       unsigned long * outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+			      unsigned char * inbuffer,
+			      unsigned long insize));
+
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
 /* Compression parameter setup aids */
@@ -932,6 +973,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
 EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
 					  int scale_factor,
 					  boolean force_baseline));
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+				       boolean force_baseline));
 EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
 				       const unsigned int *basic_table,
 				       int scale_factor,
@@ -951,12 +994,15 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
 					     JDIMENSION num_lines));
 EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
 
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
 					    JSAMPIMAGE data,
 					    JDIMENSION num_lines));
 
-/* Write a special marker.  See libjpeg.doc concerning safe usage. */
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker
 	JPP((j_compress_ptr cinfo, int marker,
 	     const JOCTET * dataptr, unsigned int datalen));
@@ -1010,6 +1056,7 @@ EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
 #define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */
 
 /* Precalculate output dimensions for current decompression parameters. */
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));
 
 /* Control saving of COM and APPn markers into marker_list. */
@@ -1104,13 +1151,10 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif
 
-#ifdef __BORLANDC__
-#pragma option pop /* pop -a switch */
-#pragma option pop /* pop -b */
-#endif
-
 #ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
 }
 #endif
+#endif
 
 #endif /* JPEGLIB_H */
diff --git a/3rdparty/libjpeg/jquant1.c b/3rdparty/libjpeg/jquant1.c
index b2f96aa15d..9d11f70669 100644
--- a/3rdparty/libjpeg/jquant1.c
+++ b/3rdparty/libjpeg/jquant1.c
@@ -2,6 +2,7 @@
  * jquant1.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -530,8 +531,8 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
-	      (size_t) (width * SIZEOF(JSAMPLE)));
+    FMEMZERO((void FAR *) output_buf[row],
+	     (size_t) (width * SIZEOF(JSAMPLE)));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -635,8 +636,8 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
-	      (size_t) (width * SIZEOF(JSAMPLE)));
+    FMEMZERO((void FAR *) output_buf[row],
+	     (size_t) (width * SIZEOF(JSAMPLE)));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
@@ -781,7 +782,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
     /* Initialize the propagated errors to zero. */
     arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
     for (i = 0; i < cinfo->out_color_components; i++)
-      jzero_far((void FAR *) cquantize->fserrors[i], arraysize);
+      FMEMZERO((void FAR *) cquantize->fserrors[i], arraysize);
     break;
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
diff --git a/3rdparty/libjpeg/jquant2.c b/3rdparty/libjpeg/jquant2.c
index af601e334b..38fc2af7a5 100644
--- a/3rdparty/libjpeg/jquant2.c
+++ b/3rdparty/libjpeg/jquant2.c
@@ -2,6 +2,7 @@
  * jquant2.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -1203,7 +1204,7 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
 	cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
 	  ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
       /* Initialize the propagated errors to zero. */
-      jzero_far((void FAR *) cquantize->fserrors, arraysize);
+      FMEMZERO((void FAR *) cquantize->fserrors, arraysize);
       /* Make the error-limit table if we didn't already. */
       if (cquantize->error_limiter == NULL)
 	init_error_limit(cinfo);
@@ -1214,8 +1215,8 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
   /* Zero the histogram or inverse color map, if necessary */
   if (cquantize->needs_zeroed) {
     for (i = 0; i < HIST_C0_ELEMS; i++) {
-      jzero_far((void FAR *) histogram[i],
-		HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell));
+      FMEMZERO((void FAR *) histogram[i],
+	       HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell));
     }
     cquantize->needs_zeroed = FALSE;
   }
diff --git a/3rdparty/libjpeg/jutils.c b/3rdparty/libjpeg/jutils.c
index d18a955562..5b16b6d03c 100644
--- a/3rdparty/libjpeg/jutils.c
+++ b/3rdparty/libjpeg/jutils.c
@@ -2,6 +2,7 @@
  * jutils.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -63,6 +64,57 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
  63, 63, 63, 63, 63, 63, 63, 63
 };
 
+const int jpeg_natural_order7[7*7+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13,  6, 14, 21, 28, 35,
+ 42, 49, 50, 43, 36, 29, 22, 30,
+ 37, 44, 51, 52, 45, 38, 46, 53,
+ 54,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order6[6*6+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 41, 34, 27,
+ 20, 13, 21, 28, 35, 42, 43, 36,
+ 29, 37, 44, 45,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order5[5*5+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4, 12,
+ 19, 26, 33, 34, 27, 20, 28, 35,
+ 36,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order4[4*4+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 25, 18, 11, 19, 26, 27,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order3[3*3+16] = {
+  0,  1,  8, 16,  9,  2, 10, 17,
+ 18,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order2[2*2+16] = {
+  0,  1,  8,  9,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
 
 /*
  * Arithmetic utilities
@@ -96,13 +148,27 @@ jround_up (long a, long b)
  * is not all that great, because these routines aren't very heavily used.)
  */
 
-#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macros */
+#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macro */
 #define FMEMCOPY(dest,src,size)	MEMCOPY(dest,src,size)
-#define FMEMZERO(target,size)	MEMZERO(target,size)
 #else				/* 80x86 case, define if we can */
 #ifdef USE_FMEM
 #define FMEMCOPY(dest,src,size)	_fmemcpy((void FAR *)(dest), (const void FAR *)(src), (size_t)(size))
-#define FMEMZERO(target,size)	_fmemset((void FAR *)(target), 0, (size_t)(size))
+#else
+/* This function is for use by the FMEMZERO macro defined in jpegint.h.
+ * Do not call this function directly, use the FMEMZERO macro instead.
+ */
+GLOBAL(void)
+jzero_far (void FAR * target, size_t bytestozero)
+/* Zero out a chunk of FAR memory. */
+/* This might be sample-array data, block-array data, or alloc_large data. */
+{
+  register char FAR * ptr = (char FAR *) target;
+  register size_t count;
+
+  for (count = bytestozero; count > 0; count--) {
+    *ptr++ = 0;
+  }
+}
 #endif
 #endif
 
@@ -159,21 +225,3 @@ jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
   }
 #endif
 }
-
-
-GLOBAL(void)
-jzero_far (void FAR * target, size_t bytestozero)
-/* Zero out a chunk of FAR memory. */
-/* This might be sample-array data, block-array data, or alloc_large data. */
-{
-#ifdef FMEMZERO
-  FMEMZERO(target, bytestozero);
-#else
-  register char FAR * ptr = (char FAR *) target;
-  register size_t count;
-
-  for (count = bytestozero; count > 0; count--) {
-    *ptr++ = 0;
-  }
-#endif
-}
diff --git a/3rdparty/libjpeg/jversion.h b/3rdparty/libjpeg/jversion.h
index 6472c58d35..5d4915103e 100644
--- a/3rdparty/libjpeg/jversion.h
+++ b/3rdparty/libjpeg/jversion.h
@@ -1,7 +1,7 @@
 /*
  * jversion.h
  *
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -9,6 +9,6 @@
  */
 
 
-#define JVERSION	"6b  27-Mar-1998"
+#define JVERSION	"8d  15-Jan-2012"
 
-#define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+#define JCOPYRIGHT	"Copyright (C) 2012, Thomas G. Lane, Guido Vollbeding"
diff --git a/3rdparty/libjpeg/transupp.c b/3rdparty/libjpeg/transupp.c
deleted file mode 100644
index c778add1b0..0000000000
--- a/3rdparty/libjpeg/transupp.c
+++ /dev/null
@@ -1,1533 +0,0 @@
-/*
- * transupp.c
- *
- * Copyright (C) 1997-2001, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains image transformation routines and other utility code
- * used by the jpegtran sample application.  These are NOT part of the core
- * JPEG library.  But we keep these routines separate from jpegtran.c to
- * ease the task of maintaining jpegtran-like programs that have other user
- * interfaces.
- */
-
-/* Although this file really shouldn't have access to the library internals,
- * it's helpful to let it call jround_up() and jcopy_block_row().
- */
-#define JPEG_INTERNALS
-
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "transupp.h"		/* My own external interface */
-#include <ctype.h>		/* to declare isdigit() */
-
-
-#if TRANSFORMS_SUPPORTED
-
-/*
- * Lossless image transformation routines.  These routines work on DCT
- * coefficient arrays and thus do not require any lossy decompression
- * or recompression of the image.
- * Thanks to Guido Vollbeding for the initial design and code of this feature,
- * and to Ben Jackson for introducing the cropping feature.
- *
- * Horizontal flipping is done in-place, using a single top-to-bottom
- * pass through the virtual source array.  It will thus be much the
- * fastest option for images larger than main memory.
- *
- * The other routines require a set of destination virtual arrays, so they
- * need twice as much memory as jpegtran normally does.  The destination
- * arrays are always written in normal scan order (top to bottom) because
- * the virtual array manager expects this.  The source arrays will be scanned
- * in the corresponding order, which means multiple passes through the source
- * arrays for most of the transforms.  That could result in much thrashing
- * if the image is larger than main memory.
- *
- * If cropping or trimming is involved, the destination arrays may be smaller
- * than the source arrays.  Note it is not possible to do horizontal flip
- * in-place when a nonzero Y crop offset is specified, since we'd have to move
- * data from one block row to another but the virtual array manager doesn't
- * guarantee we can touch more than one row at a time.  So in that case,
- * we have to use a separate destination array.
- *
- * Some notes about the operating environment of the individual transform
- * routines:
- * 1. Both the source and destination virtual arrays are allocated from the
- *    source JPEG object, and therefore should be manipulated by calling the
- *    source's memory manager.
- * 2. The destination's component count should be used.  It may be smaller
- *    than the source's when forcing to grayscale.
- * 3. Likewise the destination's sampling factors should be used.  When
- *    forcing to grayscale the destination's sampling factors will be all 1,
- *    and we may as well take that as the effective iMCU size.
- * 4. When "trim" is in effect, the destination's dimensions will be the
- *    trimmed values but the source's will be untrimmed.
- * 5. When "crop" is in effect, the destination's dimensions will be the
- *    cropped values but the source's will be uncropped.  Each transform
- *    routine is responsible for picking up source data starting at the
- *    correct X and Y offset for the crop region.  (The X and Y offsets
- *    passed to the transform routines are measured in iMCU blocks of the
- *    destination.)
- * 6. All the routines assume that the source and destination buffers are
- *    padded out to a full iMCU boundary.  This is true, although for the
- *    source buffer it is an undocumented property of jdcoefct.c.
- */
-
-
-LOCAL(void)
-do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	 jvirt_barray_ptr *src_coef_arrays,
-	 jvirt_barray_ptr *dst_coef_arrays)
-/* Crop.  This is only used when no rotate/flip is requested with the crop. */
-{
-  JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
-  int ci, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  jpeg_component_info *compptr;
-
-  /* We simply have to copy the right amount of data (the destination's
-   * image size) starting at the given X and Y offsets in the source.
-   */
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      src_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	 dst_blk_y + y_crop_blocks,
-	 (JDIMENSION) compptr->v_samp_factor, FALSE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
-			dst_buffer[offset_y],
-			compptr->width_in_blocks);
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-		   JDIMENSION x_crop_offset,
-		   jvirt_barray_ptr *src_coef_arrays)
-/* Horizontal flip; done in-place, so no separate dest array is required.
- * NB: this only works when y_crop_offset is zero.
- */
-{
-  JDIMENSION MCU_cols, comp_width, blk_x, blk_y, x_crop_blocks;
-  int ci, k, offset_y;
-  JBLOCKARRAY buffer;
-  JCOEFPTR ptr1, ptr2;
-  JCOEF temp1, temp2;
-  jpeg_component_info *compptr;
-
-  /* Horizontal mirroring of DCT blocks is accomplished by swapping
-   * pairs of blocks in-place.  Within a DCT block, we perform horizontal
-   * mirroring by changing the signs of odd-numbered columns.
-   * Partial iMCUs at the right edge are left untouched.
-   */
-  MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_width = MCU_cols * compptr->h_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    for (blk_y = 0; blk_y < compptr->height_in_blocks;
-	 blk_y += compptr->v_samp_factor) {
-      buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	/* Do the mirroring */
-	for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) {
-	  ptr1 = buffer[offset_y][blk_x];
-	  ptr2 = buffer[offset_y][comp_width - blk_x - 1];
-	  /* this unrolled loop doesn't need to know which row it's on... */
-	  for (k = 0; k < DCTSIZE2; k += 2) {
-	    temp1 = *ptr1;	/* swap even column */
-	    temp2 = *ptr2;
-	    *ptr1++ = temp2;
-	    *ptr2++ = temp1;
-	    temp1 = *ptr1;	/* swap odd column with sign change */
-	    temp2 = *ptr2;
-	    *ptr1++ = -temp2;
-	    *ptr2++ = -temp1;
-	  }
-	}
-	if (x_crop_blocks > 0) {
-	  /* Now left-justify the portion of the data to be kept.
-	   * We can't use a single jcopy_block_row() call because that routine
-	   * depends on memcpy(), whose behavior is unspecified for overlapping
-	   * source and destination areas.  Sigh.
-	   */
-	  for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
-	    jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks,
-			    buffer[offset_y] + blk_x,
-			    (JDIMENSION) 1);
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
-/* Horizontal flip in general cropping case */
-{
-  JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, k, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JBLOCKROW src_row_ptr, dst_row_ptr;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  /* Here we must output into a separate array because we can't touch
-   * different rows of a single virtual array simultaneously.  Otherwise,
-   * this is essentially the same as the routine above.
-   */
-  MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_width = MCU_cols * compptr->h_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      src_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	 dst_blk_y + y_crop_blocks,
-	 (JDIMENSION) compptr->v_samp_factor, FALSE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	dst_row_ptr = dst_buffer[offset_y];
-	src_row_ptr = src_buffer[offset_y];
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Do the mirrorable blocks */
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	    /* this unrolled loop doesn't need to know which row it's on... */
-	    for (k = 0; k < DCTSIZE2; k += 2) {
-	      *dst_ptr++ = *src_ptr++;	 /* copy even column */
-	      *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */
-	    }
-	  } else {
-	    /* Copy last partial block(s) verbatim */
-	    jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-			    dst_row_ptr + dst_blk_x,
-			    (JDIMENSION) 1);
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
-/* Vertical flip */
-{
-  JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JBLOCKROW src_row_ptr, dst_row_ptr;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  /* We output into a separate array because we can't touch different
-   * rows of the source virtual array simultaneously.  Otherwise, this
-   * is a pretty straightforward analog of horizontal flip.
-   * Within a DCT block, vertical mirroring is done by changing the signs
-   * of odd-numbered rows.
-   * Partial iMCUs at the bottom edge are copied verbatim.
-   */
-  MCU_rows = srcinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_height = MCU_rows * compptr->v_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      if (y_crop_blocks + dst_blk_y < comp_height) {
-	/* Row is within the mirrorable area. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   comp_height - y_crop_blocks - dst_blk_y -
-	   (JDIMENSION) compptr->v_samp_factor,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
-      } else {
-	/* Bottom-edge blocks will be copied verbatim. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   dst_blk_y + y_crop_blocks,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
-      }
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	if (y_crop_blocks + dst_blk_y < comp_height) {
-	  /* Row is within the mirrorable area. */
-	  dst_row_ptr = dst_buffer[offset_y];
-	  src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
-	  src_row_ptr += x_crop_blocks;
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	       dst_blk_x++) {
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    src_ptr = src_row_ptr[dst_blk_x];
-	    for (i = 0; i < DCTSIZE; i += 2) {
-	      /* copy even row */
-	      for (j = 0; j < DCTSIZE; j++)
-		*dst_ptr++ = *src_ptr++;
-	      /* copy odd row with sign change */
-	      for (j = 0; j < DCTSIZE; j++)
-		*dst_ptr++ = - *src_ptr++;
-	    }
-	  }
-	} else {
-	  /* Just copy row verbatim. */
-	  jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
-			  dst_buffer[offset_y],
-			  compptr->width_in_blocks);
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	      JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	      jvirt_barray_ptr *src_coef_arrays,
-	      jvirt_barray_ptr *dst_coef_arrays)
-/* Transpose source into destination */
-{
-  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_x, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  /* Transposing pixels within a block just requires transposing the
-   * DCT coefficients.
-   * Partial iMCUs at the edges require no special treatment; we simply
-   * process all the available DCT blocks for every component.
-   */
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  src_buffer = (*srcinfo->mem->access_virt_barray)
-	    ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	     dst_blk_x + x_crop_blocks,
-	     (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
-	    for (i = 0; i < DCTSIZE; i++)
-	      for (j = 0; j < DCTSIZE; j++)
-		dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	   JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	   jvirt_barray_ptr *src_coef_arrays,
-	   jvirt_barray_ptr *dst_coef_arrays)
-/* 90 degree rotation is equivalent to
- *   1. Transposing the image;
- *   2. Horizontal mirroring.
- * These two steps are merged into a single processing routine.
- */
-{
-  JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_x, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  /* Because of the horizontal mirror step, we can't process partial iMCUs
-   * at the (output) right edge properly.  They just get transposed and
-   * not mirrored.
-   */
-  MCU_cols = srcinfo->image_height / (dstinfo->max_h_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_width = MCU_cols * compptr->h_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Block is within the mirrorable area. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       comp_width - x_crop_blocks - dst_blk_x -
-	       (JDIMENSION) compptr->h_samp_factor,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  } else {
-	    /* Edge blocks are transposed but not mirrored. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       dst_blk_x + x_crop_blocks,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  }
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Block is within the mirrorable area. */
-	      src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++) {
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		i++;
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-	      }
-	    } else {
-	      /* Edge blocks are transposed but not mirrored. */
-	      src_ptr = src_buffer[offset_x]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++)
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	    }
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	    jvirt_barray_ptr *src_coef_arrays,
-	    jvirt_barray_ptr *dst_coef_arrays)
-/* 270 degree rotation is equivalent to
- *   1. Horizontal mirroring;
- *   2. Transposing the image.
- * These two steps are merged into a single processing routine.
- */
-{
-  JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_x, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  /* Because of the horizontal mirror step, we can't process partial iMCUs
-   * at the (output) bottom edge properly.  They just get transposed and
-   * not mirrored.
-   */
-  MCU_rows = srcinfo->image_width / (dstinfo->max_v_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_height = MCU_rows * compptr->v_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  src_buffer = (*srcinfo->mem->access_virt_barray)
-	    ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	     dst_blk_x + x_crop_blocks,
-	     (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (y_crop_blocks + dst_blk_y < comp_height) {
-	      /* Block is within the mirrorable area. */
-	      src_ptr = src_buffer[offset_x]
-		[comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-	      for (i = 0; i < DCTSIZE; i++) {
-		for (j = 0; j < DCTSIZE; j++) {
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  j++;
-		  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		}
-	      }
-	    } else {
-	      /* Edge blocks are transposed but not mirrored. */
-	      src_ptr = src_buffer[offset_x]
-		[dst_blk_y + offset_y + y_crop_blocks];
-	      for (i = 0; i < DCTSIZE; i++)
-		for (j = 0; j < DCTSIZE; j++)
-		  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	    }
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	    jvirt_barray_ptr *src_coef_arrays,
-	    jvirt_barray_ptr *dst_coef_arrays)
-/* 180 degree rotation is equivalent to
- *   1. Vertical mirroring;
- *   2. Horizontal mirroring.
- * These two steps are merged into a single processing routine.
- */
-{
-  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JBLOCKROW src_row_ptr, dst_row_ptr;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE);
-  MCU_rows = srcinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_width = MCU_cols * compptr->h_samp_factor;
-    comp_height = MCU_rows * compptr->v_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      if (y_crop_blocks + dst_blk_y < comp_height) {
-	/* Row is within the vertically mirrorable area. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   comp_height - y_crop_blocks - dst_blk_y -
-	   (JDIMENSION) compptr->v_samp_factor,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
-      } else {
-	/* Bottom-edge rows are only mirrored horizontally. */
-	src_buffer = (*srcinfo->mem->access_virt_barray)
-	  ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	   dst_blk_y + y_crop_blocks,
-	   (JDIMENSION) compptr->v_samp_factor, FALSE);
-      }
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	dst_row_ptr = dst_buffer[offset_y];
-	if (y_crop_blocks + dst_blk_y < comp_height) {
-	  /* Row is within the mirrorable area. */
-	  src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	    dst_ptr = dst_row_ptr[dst_blk_x];
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Process the blocks that can be mirrored both ways. */
-	      src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	      for (i = 0; i < DCTSIZE; i += 2) {
-		/* For even row, negate every odd column. */
-		for (j = 0; j < DCTSIZE; j += 2) {
-		  *dst_ptr++ = *src_ptr++;
-		  *dst_ptr++ = - *src_ptr++;
-		}
-		/* For odd row, negate every even column. */
-		for (j = 0; j < DCTSIZE; j += 2) {
-		  *dst_ptr++ = - *src_ptr++;
-		  *dst_ptr++ = *src_ptr++;
-		}
-	      }
-	    } else {
-	      /* Any remaining right-edge blocks are only mirrored vertically. */
-	      src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x];
-	      for (i = 0; i < DCTSIZE; i += 2) {
-		for (j = 0; j < DCTSIZE; j++)
-		  *dst_ptr++ = *src_ptr++;
-		for (j = 0; j < DCTSIZE; j++)
-		  *dst_ptr++ = - *src_ptr++;
-	      }
-	    }
-	  }
-	} else {
-	  /* Remaining rows are just mirrored horizontally. */
-	  src_row_ptr = src_buffer[offset_y];
-	  for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
-	    if (x_crop_blocks + dst_blk_x < comp_width) {
-	      /* Process the blocks that can be mirrored. */
-	      dst_ptr = dst_row_ptr[dst_blk_x];
-	      src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
-	      for (i = 0; i < DCTSIZE2; i += 2) {
-		*dst_ptr++ = *src_ptr++;
-		*dst_ptr++ = - *src_ptr++;
-	      }
-	    } else {
-	      /* Any remaining right-edge blocks are only copied. */
-	      jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-			      dst_row_ptr + dst_blk_x,
-			      (JDIMENSION) 1);
-	    }
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-LOCAL(void)
-do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	       JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-	       jvirt_barray_ptr *src_coef_arrays,
-	       jvirt_barray_ptr *dst_coef_arrays)
-/* Transverse transpose is equivalent to
- *   1. 180 degree rotation;
- *   2. Transposition;
- * or
- *   1. Horizontal mirroring;
- *   2. Transposition;
- *   3. Horizontal mirroring.
- * These steps are merged into a single processing routine.
- */
-{
-  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y;
-  JDIMENSION x_crop_blocks, y_crop_blocks;
-  int ci, i, j, offset_x, offset_y;
-  JBLOCKARRAY src_buffer, dst_buffer;
-  JCOEFPTR src_ptr, dst_ptr;
-  jpeg_component_info *compptr;
-
-  MCU_cols = srcinfo->image_height / (dstinfo->max_h_samp_factor * DCTSIZE);
-  MCU_rows = srcinfo->image_width / (dstinfo->max_v_samp_factor * DCTSIZE);
-
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    comp_width = MCU_cols * compptr->h_samp_factor;
-    comp_height = MCU_rows * compptr->v_samp_factor;
-    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
-    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
-    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
-	 dst_blk_y += compptr->v_samp_factor) {
-      dst_buffer = (*srcinfo->mem->access_virt_barray)
-	((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-	 (JDIMENSION) compptr->v_samp_factor, TRUE);
-      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
-	for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
-	     dst_blk_x += compptr->h_samp_factor) {
-	  if (x_crop_blocks + dst_blk_x < comp_width) {
-	    /* Block is within the mirrorable area. */
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       comp_width - x_crop_blocks - dst_blk_x -
-	       (JDIMENSION) compptr->h_samp_factor,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  } else {
-	    src_buffer = (*srcinfo->mem->access_virt_barray)
-	      ((j_common_ptr) srcinfo, src_coef_arrays[ci],
-	       dst_blk_x + x_crop_blocks,
-	       (JDIMENSION) compptr->h_samp_factor, FALSE);
-	  }
-	  for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
-	    dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-	    if (y_crop_blocks + dst_blk_y < comp_height) {
-	      if (x_crop_blocks + dst_blk_x < comp_width) {
-		/* Block is within the mirrorable area. */
-		src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		  }
-		  i++;
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  }
-		}
-	      } else {
-		/* Right-edge blocks are mirrored in y only */
-		src_ptr = src_buffer[offset_x]
-		  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++) {
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		    j++;
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		  }
-		}
-	      }
-	    } else {
-	      if (x_crop_blocks + dst_blk_x < comp_width) {
-		/* Bottom-edge blocks are mirrored in x only */
-		src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
-		  [dst_blk_y + offset_y + y_crop_blocks];
-		for (i = 0; i < DCTSIZE; i++) {
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-		  i++;
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
-		}
-	      } else {
-		/* At lower right corner, just transpose, no mirroring */
-		src_ptr = src_buffer[offset_x]
-		  [dst_blk_y + offset_y + y_crop_blocks];
-		for (i = 0; i < DCTSIZE; i++)
-		  for (j = 0; j < DCTSIZE; j++)
-		    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
-	      }
-	    }
-	  }
-	}
-      }
-    }
-  }
-}
-
-
-/* Parse an unsigned integer: subroutine for jtransform_parse_crop_spec.
- * Returns TRUE if valid integer found, FALSE if not.
- * *strptr is advanced over the digit string, and *result is set to its value.
- */
-
-LOCAL(boolean)
-jt_read_integer (const char ** strptr, JDIMENSION * result)
-{
-  const char * ptr = *strptr;
-  JDIMENSION val = 0;
-
-  for (; isdigit(*ptr); ptr++) {
-    val = val * 10 + (JDIMENSION) (*ptr - '0');
-  }
-  *result = val;
-  if (ptr == *strptr)
-    return FALSE;		/* oops, no digits */
-  *strptr = ptr;
-  return TRUE;
-}
-
-
-/* Parse a crop specification (written in X11 geometry style).
- * The routine returns TRUE if the spec string is valid, FALSE if not.
- *
- * The crop spec string should have the format
- *	<width>x<height>{+-}<xoffset>{+-}<yoffset>
- * where width, height, xoffset, and yoffset are unsigned integers.
- * Each of the elements can be omitted to indicate a default value.
- * (A weakness of this style is that it is not possible to omit xoffset
- * while specifying yoffset, since they look alike.)
- *
- * This code is loosely based on XParseGeometry from the X11 distribution.
- */
-
-GLOBAL(boolean)
-jtransform_parse_crop_spec (jpeg_transform_info *info, const char *spec)
-{
-  info->crop = FALSE;
-  info->crop_width_set = JCROP_UNSET;
-  info->crop_height_set = JCROP_UNSET;
-  info->crop_xoffset_set = JCROP_UNSET;
-  info->crop_yoffset_set = JCROP_UNSET;
-
-  if (isdigit(*spec)) {
-    /* fetch width */
-    if (! jt_read_integer(&spec, &info->crop_width))
-      return FALSE;
-    info->crop_width_set = JCROP_POS;
-  }
-  if (*spec == 'x' || *spec == 'X') {	
-    /* fetch height */
-    spec++;
-    if (! jt_read_integer(&spec, &info->crop_height))
-      return FALSE;
-    info->crop_height_set = JCROP_POS;
-  }
-  if (*spec == '+' || *spec == '-') {
-    /* fetch xoffset */
-    info->crop_xoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
-    spec++;
-    if (! jt_read_integer(&spec, &info->crop_xoffset))
-      return FALSE;
-  }
-  if (*spec == '+' || *spec == '-') {
-    /* fetch yoffset */
-    info->crop_yoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
-    spec++;
-    if (! jt_read_integer(&spec, &info->crop_yoffset))
-      return FALSE;
-  }
-  /* We had better have gotten to the end of the string. */
-  if (*spec != '\0')
-    return FALSE;
-  info->crop = TRUE;
-  return TRUE;
-}
-
-
-/* Trim off any partial iMCUs on the indicated destination edge */
-
-LOCAL(void)
-trim_right_edge (jpeg_transform_info *info, JDIMENSION full_width)
-{
-  JDIMENSION MCU_cols;
-
-  MCU_cols = info->output_width / (info->max_h_samp_factor * DCTSIZE);
-  if (MCU_cols > 0 && info->x_crop_offset + MCU_cols ==
-      full_width / (info->max_h_samp_factor * DCTSIZE))
-    info->output_width = MCU_cols * (info->max_h_samp_factor * DCTSIZE);
-}
-
-LOCAL(void)
-trim_bottom_edge (jpeg_transform_info *info, JDIMENSION full_height)
-{
-  JDIMENSION MCU_rows;
-
-  MCU_rows = info->output_height / (info->max_v_samp_factor * DCTSIZE);
-  if (MCU_rows > 0 && info->y_crop_offset + MCU_rows ==
-      full_height / (info->max_v_samp_factor * DCTSIZE))
-    info->output_height = MCU_rows * (info->max_v_samp_factor * DCTSIZE);
-}
-
-
-/* Request any required workspace.
- *
- * This routine figures out the size that the output image will be
- * (which implies that all the transform parameters must be set before
- * it is called).
- *
- * We allocate the workspace virtual arrays from the source decompression
- * object, so that all the arrays (both the original data and the workspace)
- * will be taken into account while making memory management decisions.
- * Hence, this routine must be called after jpeg_read_header (which reads
- * the image dimensions) and before jpeg_read_coefficients (which realizes
- * the source's virtual arrays).
- */
-
-GLOBAL(void)
-jtransform_request_workspace (j_decompress_ptr srcinfo,
-			      jpeg_transform_info *info)
-{
-  jvirt_barray_ptr *coef_arrays = NULL;
-  boolean need_workspace, transpose_it;
-  jpeg_component_info *compptr;
-  JDIMENSION xoffset, yoffset, width_in_iMCUs, height_in_iMCUs;
-  JDIMENSION width_in_blocks, height_in_blocks;
-  int ci, h_samp_factor, v_samp_factor;
-
-  /* Determine number of components in output image */
-  if (info->force_grayscale &&
-      srcinfo->jpeg_color_space == JCS_YCbCr &&
-      srcinfo->num_components == 3) {
-    /* We'll only process the first component */
-    info->num_components = 1;
-  } else {
-    /* Process all the components */
-    info->num_components = srcinfo->num_components;
-  }
-  /* If there is only one output component, force the iMCU size to be 1;
-   * else use the source iMCU size.  (This allows us to do the right thing
-   * when reducing color to grayscale, and also provides a handy way of
-   * cleaning up "funny" grayscale images whose sampling factors are not 1x1.)
-   */
-
-  switch (info->transform) {
-  case JXFORM_TRANSPOSE:
-  case JXFORM_TRANSVERSE:
-  case JXFORM_ROT_90:
-  case JXFORM_ROT_270:
-    info->output_width = srcinfo->image_height;
-    info->output_height = srcinfo->image_width;
-    if (info->num_components == 1) {
-      info->max_h_samp_factor = 1;
-      info->max_v_samp_factor = 1;
-    } else {
-      info->max_h_samp_factor = srcinfo->max_v_samp_factor;
-      info->max_v_samp_factor = srcinfo->max_h_samp_factor;
-    }
-    break;
-  default:
-    info->output_width = srcinfo->image_width;
-    info->output_height = srcinfo->image_height;
-    if (info->num_components == 1) {
-      info->max_h_samp_factor = 1;
-      info->max_v_samp_factor = 1;
-    } else {
-      info->max_h_samp_factor = srcinfo->max_h_samp_factor;
-      info->max_v_samp_factor = srcinfo->max_v_samp_factor;
-    }
-    break;
-  }
-
-  /* If cropping has been requested, compute the crop area's position and
-   * dimensions, ensuring that its upper left corner falls at an iMCU boundary.
-   */
-  if (info->crop) {
-    /* Insert default values for unset crop parameters */
-    if (info->crop_xoffset_set == JCROP_UNSET)
-      info->crop_xoffset = 0;	/* default to +0 */
-    if (info->crop_yoffset_set == JCROP_UNSET)
-      info->crop_yoffset = 0;	/* default to +0 */
-    if (info->crop_xoffset >= info->output_width ||
-	info->crop_yoffset >= info->output_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
-    if (info->crop_width_set == JCROP_UNSET)
-      info->crop_width = info->output_width - info->crop_xoffset;
-    if (info->crop_height_set == JCROP_UNSET)
-      info->crop_height = info->output_height - info->crop_yoffset;
-    /* Ensure parameters are valid */
-    if (info->crop_width <= 0 || info->crop_width > info->output_width ||
-	info->crop_height <= 0 || info->crop_height > info->output_height ||
-	info->crop_xoffset > info->output_width - info->crop_width ||
-	info->crop_yoffset > info->output_height - info->crop_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
-    /* Convert negative crop offsets into regular offsets */
-    if (info->crop_xoffset_set == JCROP_NEG)
-      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
-    else
-      xoffset = info->crop_xoffset;
-    if (info->crop_yoffset_set == JCROP_NEG)
-      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
-    else
-      yoffset = info->crop_yoffset;
-    /* Now adjust so that upper left corner falls at an iMCU boundary */
-    info->output_width =
-      info->crop_width + (xoffset % (info->max_h_samp_factor * DCTSIZE));
-    info->output_height =
-      info->crop_height + (yoffset % (info->max_v_samp_factor * DCTSIZE));
-    /* Save x/y offsets measured in iMCUs */
-    info->x_crop_offset = xoffset / (info->max_h_samp_factor * DCTSIZE);
-    info->y_crop_offset = yoffset / (info->max_v_samp_factor * DCTSIZE);
-  } else {
-    info->x_crop_offset = 0;
-    info->y_crop_offset = 0;
-  }
-
-  /* Figure out whether we need workspace arrays,
-   * and if so whether they are transposed relative to the source.
-   */
-  need_workspace = FALSE;
-  transpose_it = FALSE;
-  switch (info->transform) {
-  case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
-      need_workspace = TRUE;
-    /* No workspace needed if neither cropping nor transforming */
-    break;
-  case JXFORM_FLIP_H:
-    if (info->trim)
-      trim_right_edge(info, srcinfo->image_width);
-    if (info->y_crop_offset != 0)
-      need_workspace = TRUE;
-    /* do_flip_h_no_crop doesn't need a workspace array */
-    break;
-  case JXFORM_FLIP_V:
-    if (info->trim)
-      trim_bottom_edge(info, srcinfo->image_height);
-    /* Need workspace arrays having same dimensions as source image. */
-    need_workspace = TRUE;
-    break;
-  case JXFORM_TRANSPOSE:
-    /* transpose does NOT have to trim anything */
-    /* Need workspace arrays having transposed dimensions. */
-    need_workspace = TRUE;
-    transpose_it = TRUE;
-    break;
-  case JXFORM_TRANSVERSE:
-    if (info->trim) {
-      trim_right_edge(info, srcinfo->image_height);
-      trim_bottom_edge(info, srcinfo->image_width);
-    }
-    /* Need workspace arrays having transposed dimensions. */
-    need_workspace = TRUE;
-    transpose_it = TRUE;
-    break;
-  case JXFORM_ROT_90:
-    if (info->trim)
-      trim_right_edge(info, srcinfo->image_height);
-    /* Need workspace arrays having transposed dimensions. */
-    need_workspace = TRUE;
-    transpose_it = TRUE;
-    break;
-  case JXFORM_ROT_180:
-    if (info->trim) {
-      trim_right_edge(info, srcinfo->image_width);
-      trim_bottom_edge(info, srcinfo->image_height);
-    }
-    /* Need workspace arrays having same dimensions as source image. */
-    need_workspace = TRUE;
-    break;
-  case JXFORM_ROT_270:
-    if (info->trim)
-      trim_bottom_edge(info, srcinfo->image_width);
-    /* Need workspace arrays having transposed dimensions. */
-    need_workspace = TRUE;
-    transpose_it = TRUE;
-    break;
-  }
-
-  /* Allocate workspace if needed.
-   * Note that we allocate arrays padded out to the next iMCU boundary,
-   * so that transform routines need not worry about missing edge blocks.
-   */
-  if (need_workspace) {
-    coef_arrays = (jvirt_barray_ptr *)
-      (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE,
-		SIZEOF(jvirt_barray_ptr) * info->num_components);
-    width_in_iMCUs = (JDIMENSION)
-      jdiv_round_up((long) info->output_width,
-		    (long) (info->max_h_samp_factor * DCTSIZE));
-    height_in_iMCUs = (JDIMENSION)
-      jdiv_round_up((long) info->output_height,
-		    (long) (info->max_v_samp_factor * DCTSIZE));
-    for (ci = 0; ci < info->num_components; ci++) {
-      compptr = srcinfo->comp_info + ci;
-      if (info->num_components == 1) {
-	/* we're going to force samp factors to 1x1 in this case */
-	h_samp_factor = v_samp_factor = 1;
-      } else if (transpose_it) {
-	h_samp_factor = compptr->v_samp_factor;
-	v_samp_factor = compptr->h_samp_factor;
-      } else {
-	h_samp_factor = compptr->h_samp_factor;
-	v_samp_factor = compptr->v_samp_factor;
-      }
-      width_in_blocks = width_in_iMCUs * h_samp_factor;
-      height_in_blocks = height_in_iMCUs * v_samp_factor;
-      coef_arrays[ci] = (*srcinfo->mem->request_virt_barray)
-	((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE,
-	 width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor);
-    }
-  }
-
-  info->workspace_coef_arrays = coef_arrays;
-}
-
-
-/* Transpose destination image parameters */
-
-LOCAL(void)
-transpose_critical_parameters (j_compress_ptr dstinfo)
-{
-  int tblno, i, j, ci, itemp;
-  jpeg_component_info *compptr;
-  JQUANT_TBL *qtblptr;
-  UINT16 qtemp;
-
-  /* Transpose sampling factors */
-  for (ci = 0; ci < dstinfo->num_components; ci++) {
-    compptr = dstinfo->comp_info + ci;
-    itemp = compptr->h_samp_factor;
-    compptr->h_samp_factor = compptr->v_samp_factor;
-    compptr->v_samp_factor = itemp;
-  }
-
-  /* Transpose quantization tables */
-  for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
-    qtblptr = dstinfo->quant_tbl_ptrs[tblno];
-    if (qtblptr != NULL) {
-      for (i = 0; i < DCTSIZE; i++) {
-	for (j = 0; j < i; j++) {
-	  qtemp = qtblptr->quantval[i*DCTSIZE+j];
-	  qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i];
-	  qtblptr->quantval[j*DCTSIZE+i] = qtemp;
-	}
-      }
-    }
-  }
-}
-
-
-/* Adjust Exif image parameters.
- *
- * We try to adjust the Tags ExifImageWidth and ExifImageHeight if possible.
- */
-
-LOCAL(void)
-adjust_exif_parameters (JOCTET FAR * data, unsigned int length,
-			JDIMENSION new_width, JDIMENSION new_height)
-{
-  boolean is_motorola; /* Flag for byte order */
-  unsigned int number_of_tags, tagnum;
-  unsigned int firstoffset, offset;
-  JDIMENSION new_value;
-
-  if (length < 12) return; /* Length of an IFD entry */
-
-  /* Discover byte order */
-  if (GETJOCTET(data[0]) == 0x49 && GETJOCTET(data[1]) == 0x49)
-    is_motorola = FALSE;
-  else if (GETJOCTET(data[0]) == 0x4D && GETJOCTET(data[1]) == 0x4D)
-    is_motorola = TRUE;
-  else
-    return;
-
-  /* Check Tag Mark */
-  if (is_motorola) {
-    if (GETJOCTET(data[2]) != 0) return;
-    if (GETJOCTET(data[3]) != 0x2A) return;
-  } else {
-    if (GETJOCTET(data[3]) != 0) return;
-    if (GETJOCTET(data[2]) != 0x2A) return;
-  }
-
-  /* Get first IFD offset (offset to IFD0) */
-  if (is_motorola) {
-    if (GETJOCTET(data[4]) != 0) return;
-    if (GETJOCTET(data[5]) != 0) return;
-    firstoffset = GETJOCTET(data[6]);
-    firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[7]);
-  } else {
-    if (GETJOCTET(data[7]) != 0) return;
-    if (GETJOCTET(data[6]) != 0) return;
-    firstoffset = GETJOCTET(data[5]);
-    firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[4]);
-  }
-  if (firstoffset > length - 2) return; /* check end of data segment */
-
-  /* Get the number of directory entries contained in this IFD */
-  if (is_motorola) {
-    number_of_tags = GETJOCTET(data[firstoffset]);
-    number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset+1]);
-  } else {
-    number_of_tags = GETJOCTET(data[firstoffset+1]);
-    number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset]);
-  }
-  if (number_of_tags == 0) return;
-  firstoffset += 2;
-
-  /* Search for ExifSubIFD offset Tag in IFD0 */
-  for (;;) {
-    if (firstoffset > length - 12) return; /* check end of data segment */
-    /* Get Tag number */
-    if (is_motorola) {
-      tagnum = GETJOCTET(data[firstoffset]);
-      tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset+1]);
-    } else {
-      tagnum = GETJOCTET(data[firstoffset+1]);
-      tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset]);
-    }
-    if (tagnum == 0x8769) break; /* found ExifSubIFD offset Tag */
-    if (--number_of_tags == 0) return;
-    firstoffset += 12;
-  }
-
-  /* Get the ExifSubIFD offset */
-  if (is_motorola) {
-    if (GETJOCTET(data[firstoffset+8]) != 0) return;
-    if (GETJOCTET(data[firstoffset+9]) != 0) return;
-    offset = GETJOCTET(data[firstoffset+10]);
-    offset <<= 8;
-    offset += GETJOCTET(data[firstoffset+11]);
-  } else {
-    if (GETJOCTET(data[firstoffset+11]) != 0) return;
-    if (GETJOCTET(data[firstoffset+10]) != 0) return;
-    offset = GETJOCTET(data[firstoffset+9]);
-    offset <<= 8;
-    offset += GETJOCTET(data[firstoffset+8]);
-  }
-  if (offset > length - 2) return; /* check end of data segment */
-
-  /* Get the number of directory entries contained in this SubIFD */
-  if (is_motorola) {
-    number_of_tags = GETJOCTET(data[offset]);
-    number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset+1]);
-  } else {
-    number_of_tags = GETJOCTET(data[offset+1]);
-    number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset]);
-  }
-  if (number_of_tags < 2) return;
-  offset += 2;
-
-  /* Search for ExifImageWidth and ExifImageHeight Tags in this SubIFD */
-  do {
-    if (offset > length - 12) return; /* check end of data segment */
-    /* Get Tag number */
-    if (is_motorola) {
-      tagnum = GETJOCTET(data[offset]);
-      tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset+1]);
-    } else {
-      tagnum = GETJOCTET(data[offset+1]);
-      tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset]);
-    }
-    if (tagnum == 0xA002 || tagnum == 0xA003) {
-      if (tagnum == 0xA002)
-	new_value = new_width; /* ExifImageWidth Tag */
-      else
-	new_value = new_height; /* ExifImageHeight Tag */
-      if (is_motorola) {
-	data[offset+2] = 0; /* Format = unsigned long (4 octets) */
-	data[offset+3] = 4;
-	data[offset+4] = 0; /* Number Of Components = 1 */
-	data[offset+5] = 0;
-	data[offset+6] = 0;
-	data[offset+7] = 1;
-	data[offset+8] = 0;
-	data[offset+9] = 0;
-	data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF);
-	data[offset+11] = (JOCTET)(new_value & 0xFF);
-      } else {
-	data[offset+2] = 4; /* Format = unsigned long (4 octets) */
-	data[offset+3] = 0;
-	data[offset+4] = 1; /* Number Of Components = 1 */
-	data[offset+5] = 0;
-	data[offset+6] = 0;
-	data[offset+7] = 0;
-	data[offset+8] = (JOCTET)(new_value & 0xFF);
-	data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF);
-	data[offset+10] = 0;
-	data[offset+11] = 0;
-      }
-    }
-    offset += 12;
-  } while (--number_of_tags);
-}
-
-
-/* Adjust output image parameters as needed.
- *
- * This must be called after jpeg_copy_critical_parameters()
- * and before jpeg_write_coefficients().
- *
- * The return value is the set of virtual coefficient arrays to be written
- * (either the ones allocated by jtransform_request_workspace, or the
- * original source data arrays).  The caller will need to pass this value
- * to jpeg_write_coefficients().
- */
-
-GLOBAL(jvirt_barray_ptr *)
-jtransform_adjust_parameters (j_decompress_ptr srcinfo,
-			      j_compress_ptr dstinfo,
-			      jvirt_barray_ptr *src_coef_arrays,
-			      jpeg_transform_info *info)
-{
-  /* If force-to-grayscale is requested, adjust destination parameters */
-  if (info->force_grayscale) {
-    /* First, ensure we have YCbCr or grayscale data, and that the source's
-     * Y channel is full resolution.  (No reasonable person would make Y
-     * be less than full resolution, so actually coping with that case
-     * isn't worth extra code space.  But we check it to avoid crashing.)
-     */
-    if (((dstinfo->jpeg_color_space == JCS_YCbCr &&
-	  dstinfo->num_components == 3) ||
-	 (dstinfo->jpeg_color_space == JCS_GRAYSCALE &&
-	  dstinfo->num_components == 1)) &&
-	srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor &&
-	srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) {
-      /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed
-       * properly.  Among other things, it sets the target h_samp_factor &
-       * v_samp_factor to 1, which typically won't match the source.
-       * We have to preserve the source's quantization table number, however.
-       */
-      int sv_quant_tbl_no = dstinfo->comp_info[0].quant_tbl_no;
-      jpeg_set_colorspace(dstinfo, JCS_GRAYSCALE);
-      dstinfo->comp_info[0].quant_tbl_no = sv_quant_tbl_no;
-    } else {
-      /* Sorry, can't do it */
-      ERREXIT(dstinfo, JERR_CONVERSION_NOTIMPL);
-    }
-  } else if (info->num_components == 1) {
-    /* For a single-component source, we force the destination sampling factors
-     * to 1x1, with or without force_grayscale.  This is useful because some
-     * decoders choke on grayscale images with other sampling factors.
-     */
-    dstinfo->comp_info[0].h_samp_factor = 1;
-    dstinfo->comp_info[0].v_samp_factor = 1;
-  }
-
-  /* Correct the destination's image dimensions as necessary
-   * for crop and rotate/flip operations.
-   */
-  dstinfo->image_width = info->output_width;
-  dstinfo->image_height = info->output_height;
-
-  /* Transpose destination image parameters */
-  switch (info->transform) {
-  case JXFORM_TRANSPOSE:
-  case JXFORM_TRANSVERSE:
-  case JXFORM_ROT_90:
-  case JXFORM_ROT_270:
-    transpose_critical_parameters(dstinfo);
-    break;
-  default:
-    break;
-  }
-
-  /* Adjust Exif properties */
-  if (srcinfo->marker_list != NULL &&
-      srcinfo->marker_list->marker == JPEG_APP0+1 &&
-      srcinfo->marker_list->data_length >= 6 &&
-      GETJOCTET(srcinfo->marker_list->data[0]) == 0x45 &&
-      GETJOCTET(srcinfo->marker_list->data[1]) == 0x78 &&
-      GETJOCTET(srcinfo->marker_list->data[2]) == 0x69 &&
-      GETJOCTET(srcinfo->marker_list->data[3]) == 0x66 &&
-      GETJOCTET(srcinfo->marker_list->data[4]) == 0 &&
-      GETJOCTET(srcinfo->marker_list->data[5]) == 0) {
-    /* Suppress output of JFIF marker */
-    dstinfo->write_JFIF_header = FALSE;
-    /* Adjust Exif image parameters */
-    if (dstinfo->image_width != srcinfo->image_width ||
-	dstinfo->image_height != srcinfo->image_height)
-      /* Align data segment to start of TIFF structure for parsing */
-      adjust_exif_parameters(srcinfo->marker_list->data + 6,
-	srcinfo->marker_list->data_length - 6,
-	dstinfo->image_width, dstinfo->image_height);
-  }
-
-  /* Return the appropriate output data set */
-  if (info->workspace_coef_arrays != NULL)
-    return info->workspace_coef_arrays;
-  return src_coef_arrays;
-}
-
-
-/* Execute the actual transformation, if any.
- *
- * This must be called *after* jpeg_write_coefficients, because it depends
- * on jpeg_write_coefficients to have computed subsidiary values such as
- * the per-component width and height fields in the destination object.
- *
- * Note that some transformations will modify the source data arrays!
- */
-
-GLOBAL(void)
-jtransform_execute_transform (j_decompress_ptr srcinfo,
-			      j_compress_ptr dstinfo,
-			      jvirt_barray_ptr *src_coef_arrays,
-			      jpeg_transform_info *info)
-{
-  jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays;
-
-  /* Note: conditions tested here should match those in switch statement
-   * in jtransform_request_workspace()
-   */
-  switch (info->transform) {
-  case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
-      do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_FLIP_H:
-    if (info->y_crop_offset != 0)
-      do_flip_h(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		src_coef_arrays, dst_coef_arrays);
-    else
-      do_flip_h_no_crop(srcinfo, dstinfo, info->x_crop_offset,
-			src_coef_arrays);
-    break;
-  case JXFORM_FLIP_V:
-    do_flip_v(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_TRANSPOSE:
-    do_transpose(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		 src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_TRANSVERSE:
-    do_transverse(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-		  src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_ROT_90:
-    do_rot_90(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	      src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_ROT_180:
-    do_rot_180(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	       src_coef_arrays, dst_coef_arrays);
-    break;
-  case JXFORM_ROT_270:
-    do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
-	       src_coef_arrays, dst_coef_arrays);
-    break;
-  }
-}
-
-/* jtransform_perfect_transform
- *
- * Determine whether lossless transformation is perfectly
- * possible for a specified image and transformation.
- *
- * Inputs:
- *   image_width, image_height: source image dimensions.
- *   MCU_width, MCU_height: pixel dimensions of MCU.
- *   transform: transformation identifier.
- * Parameter sources from initialized jpeg_struct
- * (after reading source header):
- *   image_width = cinfo.image_width
- *   image_height = cinfo.image_height
- *   MCU_width = cinfo.max_h_samp_factor * DCTSIZE
- *   MCU_height = cinfo.max_v_samp_factor * DCTSIZE
- * Result:
- *   TRUE = perfect transformation possible
- *   FALSE = perfect transformation not possible
- *           (may use custom action then)
- */
-
-GLOBAL(boolean)
-jtransform_perfect_transform(JDIMENSION image_width, JDIMENSION image_height,
-			     int MCU_width, int MCU_height,
-			     JXFORM_CODE transform)
-{
-  boolean result = TRUE; /* initialize TRUE */
-
-  switch (transform) {
-  case JXFORM_FLIP_H:
-  case JXFORM_ROT_270:
-    if (image_width % (JDIMENSION) MCU_width)
-      result = FALSE;
-    break;
-  case JXFORM_FLIP_V:
-  case JXFORM_ROT_90:
-    if (image_height % (JDIMENSION) MCU_height)
-      result = FALSE;
-    break;
-  case JXFORM_TRANSVERSE:
-  case JXFORM_ROT_180:
-    if (image_width % (JDIMENSION) MCU_width)
-      result = FALSE;
-    if (image_height % (JDIMENSION) MCU_height)
-      result = FALSE;
-    break;
-  default:
-    break;
-  }
-
-  return result;
-}
-
-#endif /* TRANSFORMS_SUPPORTED */
-
-
-/* Setup decompression object to save desired markers in memory.
- * This must be called before jpeg_read_header() to have the desired effect.
- */
-
-GLOBAL(void)
-jcopy_markers_setup (j_decompress_ptr srcinfo, JCOPY_OPTION option)
-{
-#ifdef SAVE_MARKERS_SUPPORTED
-  int m;
-
-  /* Save comments except under NONE option */
-  if (option != JCOPYOPT_NONE) {
-    jpeg_save_markers(srcinfo, JPEG_COM, 0xFFFF);
-  }
-  /* Save all types of APPn markers iff ALL option */
-  if (option == JCOPYOPT_ALL) {
-    for (m = 0; m < 16; m++)
-      jpeg_save_markers(srcinfo, JPEG_APP0 + m, 0xFFFF);
-  }
-#endif /* SAVE_MARKERS_SUPPORTED */
-}
-
-/* Copy markers saved in the given source object to the destination object.
- * This should be called just after jpeg_start_compress() or
- * jpeg_write_coefficients().
- * Note that those routines will have written the SOI, and also the
- * JFIF APP0 or Adobe APP14 markers if selected.
- */
-
-GLOBAL(void)
-jcopy_markers_execute (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-		       JCOPY_OPTION option)
-{
-  jpeg_saved_marker_ptr marker;
-
-  /* In the current implementation, we don't actually need to examine the
-   * option flag here; we just copy everything that got saved.
-   * But to avoid confusion, we do not output JFIF and Adobe APP14 markers
-   * if the encoder library already wrote one.
-   */
-  for (marker = srcinfo->marker_list; marker != NULL; marker = marker->next) {
-    if (dstinfo->write_JFIF_header &&
-	marker->marker == JPEG_APP0 &&
-	marker->data_length >= 5 &&
-	GETJOCTET(marker->data[0]) == 0x4A &&
-	GETJOCTET(marker->data[1]) == 0x46 &&
-	GETJOCTET(marker->data[2]) == 0x49 &&
-	GETJOCTET(marker->data[3]) == 0x46 &&
-	GETJOCTET(marker->data[4]) == 0)
-      continue;			/* reject duplicate JFIF */
-    if (dstinfo->write_Adobe_marker &&
-	marker->marker == JPEG_APP0+14 &&
-	marker->data_length >= 5 &&
-	GETJOCTET(marker->data[0]) == 0x41 &&
-	GETJOCTET(marker->data[1]) == 0x64 &&
-	GETJOCTET(marker->data[2]) == 0x6F &&
-	GETJOCTET(marker->data[3]) == 0x62 &&
-	GETJOCTET(marker->data[4]) == 0x65)
-      continue;			/* reject duplicate Adobe */
-#ifdef NEED_FAR_POINTERS
-    /* We could use jpeg_write_marker if the data weren't FAR... */
-    {
-      unsigned int i;
-      jpeg_write_m_header(dstinfo, marker->marker, marker->data_length);
-      for (i = 0; i < marker->data_length; i++)
-	jpeg_write_m_byte(dstinfo, marker->data[i]);
-    }
-#else
-    jpeg_write_marker(dstinfo, marker->marker,
-		      marker->data, marker->data_length);
-#endif
-  }
-}
diff --git a/3rdparty/libjpeg/transupp.h b/3rdparty/libjpeg/transupp.h
deleted file mode 100644
index 981b1cee7d..0000000000
--- a/3rdparty/libjpeg/transupp.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * transupp.h
- *
- * Copyright (C) 1997-2001, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for image transformation routines and
- * other utility code used by the jpegtran sample application.  These are
- * NOT part of the core JPEG library.  But we keep these routines separate
- * from jpegtran.c to ease the task of maintaining jpegtran-like programs
- * that have other user interfaces.
- *
- * NOTE: all the routines declared here have very specific requirements
- * about when they are to be executed during the reading and writing of the
- * source and destination files.  See the comments in transupp.c, or see
- * jpegtran.c for an example of correct usage.
- */
-
-/* If you happen not to want the image transform support, disable it here */
-#ifndef TRANSFORMS_SUPPORTED
-#define TRANSFORMS_SUPPORTED 1		/* 0 disables transform code */
-#endif
-
-/*
- * Although rotating and flipping data expressed as DCT coefficients is not
- * hard, there is an asymmetry in the JPEG format specification for images
- * whose dimensions aren't multiples of the iMCU size.  The right and bottom
- * image edges are padded out to the next iMCU boundary with junk data; but
- * no padding is possible at the top and left edges.  If we were to flip
- * the whole image including the pad data, then pad garbage would become
- * visible at the top and/or left, and real pixels would disappear into the
- * pad margins --- perhaps permanently, since encoders & decoders may not
- * bother to preserve DCT blocks that appear to be completely outside the
- * nominal image area.  So, we have to exclude any partial iMCUs from the
- * basic transformation.
- *
- * Transpose is the only transformation that can handle partial iMCUs at the
- * right and bottom edges completely cleanly.  flip_h can flip partial iMCUs
- * at the bottom, but leaves any partial iMCUs at the right edge untouched.
- * Similarly flip_v leaves any partial iMCUs at the bottom edge untouched.
- * The other transforms are defined as combinations of these basic transforms
- * and process edge blocks in a way that preserves the equivalence.
- *
- * The "trim" option causes untransformable partial iMCUs to be dropped;
- * this is not strictly lossless, but it usually gives the best-looking
- * result for odd-size images.  Note that when this option is active,
- * the expected mathematical equivalences between the transforms may not hold.
- * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim
- * followed by -rot 180 -trim trims both edges.)
- *
- * We also offer a lossless-crop option, which discards data outside a given
- * image region but losslessly preserves what is inside.  Like the rotate and
- * flip transforms, lossless crop is restricted by the JPEG format: the upper
- * left corner of the selected region must fall on an iMCU boundary.  If this
- * does not hold for the given crop parameters, we silently move the upper left
- * corner up and/or left to make it so, simultaneously increasing the region
- * dimensions to keep the lower right crop corner unchanged.  (Thus, the
- * output image covers at least the requested region, but may cover more.)
- *
- * If both crop and a rotate/flip transform are requested, the crop is applied
- * last --- that is, the crop region is specified in terms of the destination
- * image.
- *
- * We also offer a "force to grayscale" option, which simply discards the
- * chrominance channels of a YCbCr image.  This is lossless in the sense that
- * the luminance channel is preserved exactly.  It's not the same kind of
- * thing as the rotate/flip transformations, but it's convenient to handle it
- * as part of this package, mainly because the transformation routines have to
- * be aware of the option to know how many components to work on.
- */
-
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jtransform_parse_crop_spec	jTrParCrop
-#define jtransform_request_workspace	jTrRequest
-#define jtransform_adjust_parameters	jTrAdjust
-#define jtransform_execute_transform	jTrExec
-#define jtransform_perfect_transform	jTrPerfect
-#define jcopy_markers_setup		jCMrkSetup
-#define jcopy_markers_execute		jCMrkExec
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
-/*
- * Codes for supported types of image transformations.
- */
-
-typedef enum {
-	JXFORM_NONE,		/* no transformation */
-	JXFORM_FLIP_H,		/* horizontal flip */
-	JXFORM_FLIP_V,		/* vertical flip */
-	JXFORM_TRANSPOSE,	/* transpose across UL-to-LR axis */
-	JXFORM_TRANSVERSE,	/* transpose across UR-to-LL axis */
-	JXFORM_ROT_90,		/* 90-degree clockwise rotation */
-	JXFORM_ROT_180,		/* 180-degree rotation */
-	JXFORM_ROT_270		/* 270-degree clockwise (or 90 ccw) */
-} JXFORM_CODE;
-
-/*
- * Codes for crop parameters, which can individually be unspecified,
- * positive, or negative.  (Negative width or height makes no sense, though.)
- */
-
-typedef enum {
-	JCROP_UNSET,
-	JCROP_POS,
-	JCROP_NEG
-} JCROP_CODE;
-
-/*
- * Transform parameters struct.
- * NB: application must not change any elements of this struct after
- * calling jtransform_request_workspace.
- */
-
-typedef struct {
-  /* Options: set by caller */
-  JXFORM_CODE transform;	/* image transform operator */
-  boolean perfect;		/* if TRUE, fail if partial MCUs are requested */
-  boolean trim;			/* if TRUE, trim partial MCUs as needed */
-  boolean force_grayscale;	/* if TRUE, convert color image to grayscale */
-  boolean crop;			/* if TRUE, crop source image */
-
-  /* Crop parameters: application need not set these unless crop is TRUE.
-   * These can be filled in by jtransform_parse_crop_spec().
-   */
-  JDIMENSION crop_width;	/* Width of selected region */
-  JCROP_CODE crop_width_set;
-  JDIMENSION crop_height;	/* Height of selected region */
-  JCROP_CODE crop_height_set;
-  JDIMENSION crop_xoffset;	/* X offset of selected region */
-  JCROP_CODE crop_xoffset_set;	/* (negative measures from right edge) */
-  JDIMENSION crop_yoffset;	/* Y offset of selected region */
-  JCROP_CODE crop_yoffset_set;	/* (negative measures from bottom edge) */
-
-  /* Internal workspace: caller should not touch these */
-  int num_components;		/* # of components in workspace */
-  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
-  JDIMENSION output_width;	/* cropped destination dimensions */
-  JDIMENSION output_height;
-  JDIMENSION x_crop_offset;	/* destination crop offsets measured in iMCUs */
-  JDIMENSION y_crop_offset;
-  int max_h_samp_factor;	/* destination iMCU size */
-  int max_v_samp_factor;
-} jpeg_transform_info;
-
-
-#if TRANSFORMS_SUPPORTED
-
-/* Parse a crop specification (written in X11 geometry style) */
-EXTERN(boolean) jtransform_parse_crop_spec
-	JPP((jpeg_transform_info *info, const char *spec));
-/* Request any required workspace */
-EXTERN(void) jtransform_request_workspace
-	JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info));
-/* Adjust output image parameters */
-EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     jvirt_barray_ptr *src_coef_arrays,
-	     jpeg_transform_info *info));
-/* Execute the actual transformation, if any */
-EXTERN(void) jtransform_execute_transform
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     jvirt_barray_ptr *src_coef_arrays,
-	     jpeg_transform_info *info));
-/* Determine whether lossless transformation is perfectly
- * possible for a specified image and transformation.
- */
-EXTERN(boolean) jtransform_perfect_transform
-	JPP((JDIMENSION image_width, JDIMENSION image_height,
-	     int MCU_width, int MCU_height,
-	     JXFORM_CODE transform));
-
-/* jtransform_execute_transform used to be called
- * jtransform_execute_transformation, but some compilers complain about
- * routine names that long.  This macro is here to avoid breaking any
- * old source code that uses the original name...
- */
-#define jtransform_execute_transformation	jtransform_execute_transform
-
-#endif /* TRANSFORMS_SUPPORTED */
-
-
-/*
- * Support for copying optional markers from source to destination file.
- */
-
-typedef enum {
-	JCOPYOPT_NONE,		/* copy no optional markers */
-	JCOPYOPT_COMMENTS,	/* copy only comment (COM) markers */
-	JCOPYOPT_ALL		/* copy all optional markers */
-} JCOPY_OPTION;
-
-#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS	/* recommended default */
-
-/* Setup decompression object to save desired markers in memory */
-EXTERN(void) jcopy_markers_setup
-	JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option));
-/* Copy markers saved in the given source object to the destination object */
-EXTERN(void) jcopy_markers_execute
-	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-	     JCOPY_OPTION option));