Merge branch 4.x

pull/22408/head
Alexander Alekhin 3 years ago
commit c25f776151
  1. 3rdparty/carotene/CMakeLists.txt (2)
  2. 3rdparty/carotene/hal/CMakeLists.txt (2)
  3. 3rdparty/ffmpeg/ffmpeg.cmake (10)
  4. 3rdparty/ittnotify/CMakeLists.txt (1)
  5. 3rdparty/libjpeg-turbo/CMakeLists.txt (19)
  6. 3rdparty/libjpeg-turbo/LICENSE.md (2)
  7. 3rdparty/libjpeg-turbo/src/jcapimin.c (6)
  8. 3rdparty/libjpeg-turbo/src/jcarith.c (12)
  9. 3rdparty/libjpeg-turbo/src/jchuff.c (22)
  10. 3rdparty/libjpeg-turbo/src/jcphuff.c (10)
  11. 3rdparty/libjpeg-turbo/src/jcprepct.c (8)
  12. 3rdparty/libjpeg-turbo/src/jctrans.c (6)
  13. 3rdparty/libjpeg-turbo/src/jdapimin.c (6)
  14. 3rdparty/libjpeg-turbo/src/jdapistd.c (8)
  15. 3rdparty/libjpeg-turbo/src/jdarith.c (22)
  16. 3rdparty/libjpeg-turbo/src/jdatadst.c (13)
  17. 3rdparty/libjpeg-turbo/src/jdatasrc.c (4)
  18. 3rdparty/libjpeg-turbo/src/jddctmgr.c (4)
  19. 3rdparty/libjpeg-turbo/src/jdicc.c (4)
  20. 3rdparty/libjpeg-turbo/src/jdinput.c (4)
  21. 3rdparty/libjpeg-turbo/src/jdmarker.c (8)
  22. 3rdparty/libjpeg-turbo/src/jdmaster.c (12)
  23. 3rdparty/libjpeg-turbo/src/jdphuff.c (10)
  24. 3rdparty/libjpeg-turbo/src/jerror.c (16)
  25. 3rdparty/libjpeg-turbo/src/jerror.h (6)
  26. 3rdparty/libjpeg-turbo/src/jinclude.h (133)
  27. 3rdparty/libjpeg-turbo/src/jmemmgr.c (16)
  28. 3rdparty/libjpeg-turbo/src/jmemnobs.c (5)
  29. 3rdparty/libjpeg-turbo/src/jmorecfg.h (4)
  30. 3rdparty/libjpeg-turbo/src/jpegint.h (9)
  31. 3rdparty/libjpeg-turbo/src/jstdhuff.c (9)
  32. 3rdparty/libjpeg-turbo/src/jutils.c (10)
  33. 3rdparty/libjpeg-turbo/src/jversion.h.in (54)
  34. 3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt (540)
  35. 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jccolext-neon.c (148)
  36. 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jchuff-neon.c (334)
  37. 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c (980)
  38. 3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd_neon.S (1200)
  39. 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jccolext-neon.c (316)
  40. 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jchuff-neon.c (411)
  41. 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c (1058)
  42. 3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd_neon.S (2254)
  43. 3rdparty/libjpeg-turbo/src/simd/arm/align.h (28)
  44. 3rdparty/libjpeg-turbo/src/simd/arm/jccolor-neon.c (160)
  45. 3rdparty/libjpeg-turbo/src/simd/arm/jcgray-neon.c (120)
  46. 3rdparty/libjpeg-turbo/src/simd/arm/jcgryext-neon.c (106)
  47. 3rdparty/libjpeg-turbo/src/simd/arm/jchuff.h (131)
  48. 3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c (622)
  49. 3rdparty/libjpeg-turbo/src/simd/arm/jcsample-neon.c (192)
  50. 3rdparty/libjpeg-turbo/src/simd/arm/jdcolext-neon.c (374)
  51. 3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c (142)
  52. 3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c (145)
  53. 3rdparty/libjpeg-turbo/src/simd/arm/jdmrgext-neon.c (723)
  54. 3rdparty/libjpeg-turbo/src/simd/arm/jdsample-neon.c (569)
  55. 3rdparty/libjpeg-turbo/src/simd/arm/jfdctfst-neon.c (214)
  56. 3rdparty/libjpeg-turbo/src/simd/arm/jfdctint-neon.c (376)
  57. 3rdparty/libjpeg-turbo/src/simd/arm/jidctfst-neon.c (472)
  58. 3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c (802)
  59. 3rdparty/libjpeg-turbo/src/simd/arm/jidctred-neon.c (486)
  60. 3rdparty/libjpeg-turbo/src/simd/arm/jquanti-neon.c (193)
  61. 3rdparty/libjpeg-turbo/src/simd/arm/neon-compat.h.in (37)
  62. 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm (578)
  63. 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm (476)
  64. 3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm (503)
  65. 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm (121)
  66. 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm (121)
  67. 3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm (120)
  68. 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm (113)
  69. 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm (113)
  70. 3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm (112)
  71. 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm (457)
  72. 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm (355)
  73. 3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm (382)
  74. 3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm (761)
  75. 3rdparty/libjpeg-turbo/src/simd/i386/jcphuff-sse2.asm (662)
  76. 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm (388)
  77. 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm (324)
  78. 3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm (351)
  79. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm (515)
  80. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm (404)
  81. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm (458)
  82. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm (118)
  83. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm (117)
  84. 3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm (117)
  85. 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm (136)
  86. 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm (123)
  87. 3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm (135)
  88. 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm (575)
  89. 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm (460)
  90. 3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm (517)
  91. 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm (760)
  92. 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm (731)
  93. 3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm (724)
  94. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm (318)
  95. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm (369)
  96. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm (395)
  97. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm (403)
  98. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm (331)
  99. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm (620)
  100. 3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm (633)
  Some files were not shown because too many files have changed in this diff.

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
project(Carotene)

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)

@ -1,8 +1,8 @@
# Binaries branch name: ffmpeg/master_20211005
# Binaries were created for OpenCV: 672399c751c431bbe52818b33fd3ca17b51e0e16
ocv_update(FFMPEG_BINARIES_COMMIT "40b4666d1aa374205fd61373496e15d92ecd5313")
ocv_update(FFMPEG_FILE_HASH_BIN32 "c2f9a897d464a2dce2286f8067ad9d90")
ocv_update(FFMPEG_FILE_HASH_BIN64 "878a4e8fe5a4d68f18c9cdde543b9ead")
# Binaries branch name: ffmpeg/4.x_20220524
# Binaries were created for OpenCV: d6e9616256b46bd59be0a93d397f6ab958d39cd2
ocv_update(FFMPEG_BINARIES_COMMIT "65ec04d4573dcdfa4531f0b9e67f35d8ffff873e")
ocv_update(FFMPEG_FILE_HASH_BIN32 "5573e2262ad1298e603122b7759fc2f6")
ocv_update(FFMPEG_FILE_HASH_BIN64 "5f9e2b2e04c15f080f40e844de80c867")
ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")
function(download_win_ffmpeg script_var)

@ -54,6 +54,7 @@ set_target_properties(${ITT_LIBRARY} PROPERTIES
)
ocv_warnings_disable(CMAKE_C_FLAGS -Wundef -Wsign-compare)
ocv_warnings_disable(CMAKE_C_FLAGS -Wstrict-prototypes) # clang15
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${ITT_LIBRARY} PROPERTIES FOLDER "3rdparty")

@ -4,9 +4,9 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-6
set(VERSION_MAJOR 2)
set(VERSION_MINOR 1)
set(VERSION_REVISION 2)
set(VERSION_REVISION 3)
set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION})
set(LIBJPEG_TURBO_VERSION_NUMBER 2001002)
set(LIBJPEG_TURBO_VERSION_NUMBER 2001003)
string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
@ -79,14 +79,13 @@ configure_file(jconfigint.h.in jconfigint.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src)
set(JPEG_SOURCES
jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c jcicc.c
jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c jdmainct.c jdmarker.c
jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c
jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c
jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c
jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)

@ -91,7 +91,7 @@ best of our understanding.
The Modified (3-clause) BSD License
===================================
Copyright (C)2009-2021 D. R. Commander. All Rights Reserved.<br>
Copyright (C)2009-2022 D. R. Commander. All Rights Reserved.<br>
Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
Redistribution and use in source and binary forms, with or without

@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -52,7 +52,7 @@ jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2015, 2018, D. R. Commander.
* Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -338,14 +338,14 @@ emit_restart(j_compress_ptr cinfo, int restart_num)
compptr = cinfo->cur_comp_info[ci];
/* DC needs no table for refinement scan */
if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
/* AC needs no table when not present */
if (cinfo->progressive_mode == 0 || cinfo->Se) {
MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@ -836,7 +836,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
* We are fully adaptive here and need no extra
* statistics gathering pass!
*/
ERREXIT(cinfo, JERR_NOT_COMPILED);
ERREXIT(cinfo, JERR_NOTIMPL);
/* We assume jcmaster.c already validated the progressive scan parameters. */
@ -867,7 +867,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
if (entropy->dc_stats[tbl] == NULL)
entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@ -880,7 +880,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
if (entropy->ac_stats[tbl] == NULL)
entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
#ifdef CALCULATE_SPECTRAL_CONDITIONING
if (cinfo->progressive_mode)
/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander.
* Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020, Arm Limited.
@ -200,12 +200,12 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->dc_count_ptrs[dctbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
if (entropy->ac_count_ptrs[actbl] == NULL)
entropy->ac_count_ptrs[actbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
#endif
} else {
/* Compute derived values for Huffman tables */
@ -315,8 +315,8 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
* this lets us detect duplicate VAL entries here, and later
* allows emit_bits to detect any attempt to emit such symbols.
*/
MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco));
MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
/* This is also a convenient place to check for out-of-range
* and duplicated VAL entries. We allow 0..255 for AC symbols
@ -478,7 +478,7 @@ dump_buffer(working_state *state)
buffer = _buffer; \
while (bytes > 0) { \
bytestocopy = MIN(bytes, state->free_in_buffer); \
MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
memcpy(state->next_output_byte, buffer, bytestocopy); \
state->next_output_byte += bytestocopy; \
buffer += bytestocopy; \
state->free_in_buffer -= bytestocopy; \
@ -941,8 +941,8 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* This algorithm is explained in section K.2 of the JPEG standard */
MEMZERO(bits, sizeof(bits));
MEMZERO(codesize, sizeof(codesize));
memset(bits, 0, sizeof(bits));
memset(codesize, 0, sizeof(codesize));
for (i = 0; i < 257; i++)
others[i] = -1; /* init links to empty */
@ -1044,7 +1044,7 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
bits[i]--;
/* Return final symbol counts (only for lengths 0..16) */
MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
memcpy(htbl->bits, bits, sizeof(htbl->bits));
/* Return a list of the symbols sorted by code length */
/* It's not real clear to me why we don't need to consider the codelength
@ -1083,8 +1083,8 @@ finish_pass_gather(j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
MEMZERO(did_dc, sizeof(did_dc));
MEMZERO(did_ac, sizeof(did_ac));
memset(did_dc, 0, sizeof(did_dc));
memset(did_ac, 0, sizeof(did_ac));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander.
* Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
* Copyright (C) 2016, 2018, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
* Copyright (C) 2021, Alex Richardson.
@ -275,7 +275,7 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->count_ptrs[tbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
} else {
/* Compute derived values for Huffman table */
/* We may do this more than once for a table, but it's not expensive */
@ -584,8 +584,8 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
continue; \
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
temp2 ^= temp; \
values[k] = temp; \
values[k + DCTSIZE2] = temp2; \
values[k] = (JCOEF)temp; \
values[k + DCTSIZE2] = (JCOEF)temp2; \
zerobits |= ((size_t)1U) << k; \
} \
}
@ -1062,7 +1062,7 @@ finish_pass_gather_phuff(j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
MEMZERO(did, sizeof(did));
memset(did, 0, sizeof(did));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];

@ -3,8 +3,8 @@
*
* This file is part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -289,8 +289,8 @@ create_context_buffer(j_compress_ptr cinfo)
cinfo->max_h_samp_factor) / compptr->h_samp_factor),
(JDIMENSION)(3 * rgroup_height));
/* Copy true buffer row pointers into the middle of the fake row array */
MEMCOPY(fake_buffer + rgroup_height, true_buffer,
3 * rgroup_height * sizeof(JSAMPROW));
memcpy(fake_buffer + rgroup_height, true_buffer,
3 * rgroup_height * sizeof(JSAMPROW));
/* Fill in the above and below wraparound pointers */
for (i = 0; i < rgroup_height; i++) {
fake_buffer[i] = true_buffer[2 * rgroup_height + i];

@ -5,7 +5,7 @@
* Copyright (C) 1995-1998, Thomas G. Lane.
* Modified 2000-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2020, D. R. Commander.
* Copyright (C) 2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -100,8 +100,8 @@ jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
if (*qtblptr == NULL)
*qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
MEMCOPY((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
sizeof((*qtblptr)->quantval));
memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
sizeof((*qtblptr)->quantval));
(*qtblptr)->sent_table = FALSE;
}
}

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2016, D. R. Commander.
* Copyright (C) 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -53,7 +53,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
@ -92,7 +92,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
cinfo->master = (struct jpeg_decomp_master *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_decomp_master));
MEMZERO(cinfo->master, sizeof(my_decomp_master));
memset(cinfo->master, 0, sizeof(my_decomp_master));
}

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2015-2020, D. R. Commander.
* Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@ -159,6 +159,7 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
JDIMENSION input_xoffset;
boolean reinit_upsampler = FALSE;
jpeg_component_info *compptr;
my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@ -208,6 +209,11 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
*width = *width + input_xoffset - *xoffset;
cinfo->output_width = *width;
if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
upsample->out_row_width =
cinfo->output_width * cinfo->out_color_components;
}
/* Set the first and last iMCU columns that we must decompress. These values
* will be used in single-scan decompressions.

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2015 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2015-2020, D. R. Commander.
* Copyright (C) 2015-2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -210,13 +210,13 @@ process_restart(j_decompress_ptr cinfo)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
if (!cinfo->progressive_mode || cinfo->Ss) {
MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@ -471,17 +471,17 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (*thiscoef) { /* previously nonzero coef */
if (arith_decode(cinfo, st + 2)) {
if (*thiscoef < 0)
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
else
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
}
break;
}
if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */
if (arith_decode(cinfo, entropy->fixed_bin))
*thiscoef = m1;
*thiscoef = (JCOEF)m1;
else
*thiscoef = p1;
*thiscoef = (JCOEF)p1;
break;
}
st += 3; k++;
@ -698,8 +698,8 @@ bad:
/* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
* This ought to be an error condition, but we make it a warning.
*/
if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
(cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
/* Select MCU decoding routine */
entropy->pub.decode_mcu = decode_mcu;
@ -715,7 +715,7 @@ bad:
if (entropy->dc_stats[tbl] == NULL)
entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@ -727,7 +727,7 @@ bad:
if (entropy->ac_stats[tbl] == NULL)
entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
}
}

@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2012 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, 2016, D. R. Commander.
* Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -23,11 +23,6 @@
#include "jpeglib.h"
#include "jerror.h"
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
extern void *malloc(size_t size);
extern void free(void *ptr);
#endif
/* Expanded data destination object for stdio output */
@ -116,7 +111,7 @@ empty_output_buffer(j_compress_ptr cinfo)
{
my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
(size_t)OUTPUT_BUF_SIZE)
ERREXIT(cinfo, JERR_FILE_WRITE);
@ -141,7 +136,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
if (nextbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
memcpy(nextbuffer, dest->buffer, dest->bufsize);
free(dest->newbuffer);
@ -175,7 +170,7 @@ term_destination(j_compress_ptr cinfo)
/* Write any data remaining in the buffer */
if (datacount > 0) {
if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
ERREXIT(cinfo, JERR_FILE_WRITE);
}
fflush(dest->outfile);

@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2011 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, 2016, D. R. Commander.
* Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -104,7 +104,7 @@ fill_input_buffer(j_decompress_ptr cinfo)
my_src_ptr src = (my_src_ptr)cinfo->src;
size_t nbytes;
nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
if (nbytes <= 0) {
if (src->start_of_file) /* Treat empty input file as fatal error */

@ -6,7 +6,7 @@
* Modified 2002-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2010, 2015, D. R. Commander.
* Copyright (C) 2010, 2015, 2022, D. R. Commander.
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@ -345,7 +345,7 @@ jinit_inverse_dct(j_decompress_ptr cinfo)
compptr->dct_table =
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(multiplier_table));
MEMZERO(compptr->dct_table, sizeof(multiplier_table));
memset(compptr->dct_table, 0, sizeof(multiplier_table));
/* Mark multiplier table not yet set up for any method */
idct->cur_method[ci] = -1;
}

@ -18,10 +18,6 @@
#include "jpeglib.h"
#include "jerror.h"
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc() */
extern void *malloc(size_t size);
#endif
#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2016, 2018, D. R. Commander.
* Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@ -264,7 +264,7 @@ latch_quant_tables(j_decompress_ptr cinfo)
qtbl = (JQUANT_TBL *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JQUANT_TBL));
MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
compptr->quant_table = qtbl;
}
}

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2012, 2015, D. R. Commander.
* Copyright (C) 2012, 2015, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -473,7 +473,7 @@ get_dht(j_decompress_ptr cinfo)
for (i = 0; i < count; i++)
INPUT_BYTE(cinfo, huffval[i], return FALSE);
MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
length -= count;
@ -491,8 +491,8 @@ get_dht(j_decompress_ptr cinfo)
if (*htblptr == NULL)
*htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
}
if (length != 0)

@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
* Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
@ -417,7 +417,7 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */
cinfo->sample_range_limit = table;
/* First segment of "simple" table: limit[x] = 0 for x < 0 */
MEMZERO(table - (MAXJSAMPLE + 1), (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
/* Main part of "simple" table: limit[x] = x */
for (i = 0; i <= MAXJSAMPLE; i++)
table[i] = (JSAMPLE)i;
@ -426,10 +426,10 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
table[i] = MAXJSAMPLE;
/* Second half of post-IDCT table */
MEMZERO(table + (2 * (MAXJSAMPLE + 1)),
(2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
MEMCOPY(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
memset(table + (2 * (MAXJSAMPLE + 1)), 0,
(2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
}

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2015-2016, 2018-2021, D. R. Commander.
* Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -578,9 +578,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
if (*thiscoef >= 0)
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
else
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
}
}
} else {
@ -612,9 +612,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
if (*thiscoef >= 0)
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
else
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
}
}
}

@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -189,13 +189,13 @@ format_message(j_common_ptr cinfo, char *buffer)
/* Format the message into the passed buffer */
if (isstring)
sprintf(buffer, msgtext, err->msg_parm.s);
snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
else
sprintf(buffer, msgtext,
err->msg_parm.i[0], err->msg_parm.i[1],
err->msg_parm.i[2], err->msg_parm.i[3],
err->msg_parm.i[4], err->msg_parm.i[5],
err->msg_parm.i[6], err->msg_parm.i[7]);
snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
err->msg_parm.i[0], err->msg_parm.i[1],
err->msg_parm.i[2], err->msg_parm.i[3],
err->msg_parm.i[4], err->msg_parm.i[5],
err->msg_parm.i[6], err->msg_parm.i[7]);
}
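(Context, not part of the diff: the format_message() change above relies on the standard guarantee that snprintf() never writes more than the given size and NUL-terminates within it, so bounding the write to JMSG_LENGTH_MAX cannot overflow the caller's buffer the way sprintf() could. A minimal standalone C sketch:)

#include <stdio.h>

int main(void)
{
  char buffer[8];

  /* snprintf() truncates to the buffer size and NUL-terminates; a plain
   * sprintf() with the same arguments would write past the end of buffer. */
  int wanted = snprintf(buffer, sizeof(buffer), "%s", "longer than eight");

  printf("wanted %d bytes, kept \"%s\"\n", wanted, buffer);
  return 0;
}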

@ -5,7 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2014, 2017, D. R. Commander.
* Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -103,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
"Cannot transcode due to multiple use of quantization table %d")
JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
#if JPEG_LIB_VERSION >= 70
JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
@ -268,6 +268,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
#define ERREXITS(cinfo, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
(cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
(*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
#define MAKESTMT(stuff) do { stuff } while (0)
@ -324,6 +325,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
#define TRACEMSS(cinfo, lvl, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
(cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
(*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
#endif /* JERROR_H */
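(Context, not part of the diff: the extra assignment added to ERREXITS and TRACEMSS above is needed because strncpy() does not NUL-terminate when the source string is at least as long as the limit. A small standalone illustration of the same pattern:)

#include <stdio.h>
#include <string.h>

int main(void)
{
  char parm[8];

  /* strncpy() fills all 8 bytes here and leaves no terminator... */
  strncpy(parm, "0123456789", sizeof(parm));
  /* ...so, as in the jerror.h macros, force one at the last position. */
  parm[sizeof(parm) - 1] = '\0';

  printf("%s\n", parm);   /* prints "0123456" */
  return 0;
}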

@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1994, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -17,72 +17,117 @@
* JPEG library. Most applications need only include jpeglib.h.
*/
#ifndef __JINCLUDE_H__
#define __JINCLUDE_H__
/* Include auto-config file to find out which system include files we need. */
#include "jconfig.h" /* auto configuration options */
#include "jconfigint.h"
#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */
/*
* We need the NULL macro and size_t typedef.
* On an ANSI-conforming system it is sufficient to include <stddef.h>.
* Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
* pull in <sys/types.h> as well.
* Note that the core JPEG library does not require <stdio.h>;
* only the default error handler and data source/destination modules do.
* But we must pull it in because of the references to FILE in jpeglib.h.
* You can remove those references if you want to compile without <stdio.h>.
*/
#ifdef HAVE_STDDEF_H
#include <stddef.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef NEED_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <stdio.h>
#include <string.h>
/*
* We need memory copying and zeroing functions, plus strncpy().
* ANSI and System V implementations declare these in <string.h>.
* BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
* Some systems may declare memset and memcpy in <memory.h>.
*
* NOTE: we assume the size parameters to these functions are of type size_t.
* Change the casts in these macros if not!
* These macros/inline functions facilitate using Microsoft's "safe string"
* functions with Visual Studio builds without the need to scatter #ifdefs
* throughout the code base.
*/
#ifdef NEED_BSD_STRINGS
#include <strings.h>
#define MEMZERO(target, size) \
bzero((void *)(target), (size_t)(size))
#define MEMCOPY(dest, src, size) \
bcopy((const void *)(src), (void *)(dest), (size_t)(size))
#ifndef NO_GETENV
#else /* not BSD, assume ANSI/SysV string lib */
#ifdef _MSC_VER
#include <string.h>
#define MEMZERO(target, size) \
memset((void *)(target), 0, (size_t)(size))
#define MEMCOPY(dest, src, size) \
memcpy((void *)(dest), (const void *)(src), (size_t)(size))
static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
{
size_t required_size;
#endif
return (int)getenv_s(&required_size, buffer, buffer_size, name);
}
/*
* The modules that use fread() and fwrite() always invoke them through
* these macros. On some systems you may need to twiddle the argument casts.
* CAUTION: argument order is different from underlying functions!
#else /* _MSC_VER */
#include <errno.h>
/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
* but other than parameter validation, it has no advantages over getenv().
*/
static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
{
char *env;
if (!buffer) {
if (buffer_size == 0)
return 0;
else
return (errno = EINVAL);
}
if (buffer_size == 0)
return (errno = EINVAL);
if (!name) {
*buffer = 0;
return 0;
}
env = getenv(name);
if (!env)
{
*buffer = 0;
return 0;
}
if (strlen(env) + 1 > buffer_size) {
*buffer = 0;
return ERANGE;
}
strncpy(buffer, env, buffer_size);
return 0;
}
#endif /* _MSC_VER */
#endif /* NO_GETENV */
#ifndef NO_PUTENV
#ifdef _WIN32
#define PUTENV_S(name, value) _putenv_s(name, value)
#else
/* This provides a similar interface to the Microsoft _putenv_s() function, but
* other than parameter validation, it has no advantages over setenv().
*/
#define JFREAD(file, buf, sizeofbuf) \
((size_t)fread((void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
#define JFWRITE(file, buf, sizeofbuf) \
((size_t)fwrite((const void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
static INLINE int PUTENV_S(const char *name, const char *value)
{
if (!name || !value)
return (errno = EINVAL);
setenv(name, value, 1);
return errno;
}
#endif /* _WIN32 */
#endif /* NO_PUTENV */
#endif /* JINCLUDE_H */
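(Context, not part of the diff: a caller-side sketch of the new wrapper, assuming jinclude.h is included as the library's own modules do. Per the hunk above, GETENV_S() copies the variable's value into the caller's buffer, leaves an empty string when it is unset, and returns 0 on success or an errno-style code otherwise. The helper name read_jpegmem() is illustrative only; the fixed 30-byte buffer mirrors the jmemmgr.c hunk later in this commit.)

#include <stdlib.h>
#include <string.h>

#include "jinclude.h"   /* provides GETENV_S()/PUTENV_S() inside the library */

static long read_jpegmem(void)
{
  char memenv[30] = { 0 };

  if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0)
    return atol(memenv);   /* simplified; the real code also honors an 'M' suffix */
  return 0;
}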

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2016, 2021, D. R. Commander.
* Copyright (C) 2016, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -37,12 +37,6 @@
#endif
#include <limits.h>
#ifndef NO_GETENV
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
extern char *getenv(const char *name);
#endif
#endif
LOCAL(size_t)
round_up_pow2(size_t a, size_t b)
@ -1162,12 +1156,16 @@ jinit_memory_mgr(j_common_ptr cinfo)
*/
#ifndef NO_GETENV
{
char *memenv;
char memenv[30] = { 0 };
if ((memenv = getenv("JPEGMEM")) != NULL) {
if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
char ch = 'x';
#ifdef _MSC_VER
if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
#else
if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
#endif
if (ch == 'm' || ch == 'M')
max_to_use *= 1000L;
mem->pub.max_memory_to_use = max_to_use * 1000L;
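(Context, not part of the diff: a self-contained sketch of how the JPEGMEM value read above is interpreted. The parsed number is taken as thousands of bytes, a trailing 'm' or 'M' multiplies it by a further 1000, and the result is stored in max_memory_to_use. parse_jpegmem() is a hypothetical stand-in for the parsing done inline in jinit_memory_mgr().)

#include <stdio.h>

/* Hypothetical helper mirroring the sscanf()-based parsing shown above. */
static long parse_jpegmem(const char *memenv)
{
  long max_to_use = 0;
  char ch = 'x';

  if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
    if (ch == 'm' || ch == 'M')
      max_to_use *= 1000L;
  }
  return max_to_use * 1000L;   /* bytes stored in max_memory_to_use */
}

int main(void)
{
  printf("%ld\n", parse_jpegmem("100"));    /* 100000 */
  printf("%ld\n", parse_jpegmem("100M"));   /* 100000000 */
  return 0;
}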

@ -22,11 +22,6 @@
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
extern void *malloc(size_t size);
extern void free(void *ptr);
#endif
/*
* Memory allocation and freeing are controlled by the regular library

@ -100,11 +100,7 @@ typedef unsigned char UINT8;
/* UINT16 must hold at least the values 0..65535. */
#ifdef HAVE_UNSIGNED_SHORT
typedef unsigned short UINT16;
#else /* not HAVE_UNSIGNED_SHORT */
typedef unsigned int UINT16;
#endif /* HAVE_UNSIGNED_SHORT */
/* INT16 must hold at least the values -32768..32767. */

@ -373,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
/* Arithmetic coding probability estimation tables in jaricom.c */
extern const JLONG jpeg_aritab[];
/* Suppress undefined-structure complaints if necessary. */
#ifdef INCOMPLETE_TYPES_BROKEN
#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */
struct jvirt_sarray_control { long dummy; };
struct jvirt_barray_control { long dummy; };
#endif
#endif /* INCOMPLETE_TYPES_BROKEN */

@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, D. R. Commander.
* Copyright (C) 2013, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -29,7 +29,7 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
return;
/* Copy the number-of-symbols-of-each-code-length counts */
MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
/* Validate the counts. We do this here mainly so we can copy the right
* number of symbols from the val[] array, without risking marching off
@ -41,8 +41,9 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
if (nsymbols < 1 || nsymbols > 256)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
memset(&((*htblptr)->huffval[nsymbols]), 0,
(256 - nsymbols) * sizeof(UINT8));
/* Initialize sent_table FALSE so table will be written to JPEG file. */
(*htblptr)->sent_table = FALSE;

@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code
* relevant to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -110,7 +110,7 @@ jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
for (row = num_rows; row > 0; row--) {
inptr = *input_array++;
outptr = *output_array++;
MEMCOPY(outptr, inptr, count);
memcpy(outptr, inptr, count);
}
}
@ -120,7 +120,7 @@ jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
JDIMENSION num_blocks)
/* Copy a row of coefficient blocks from one place to another. */
{
MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
}
@ -129,5 +129,5 @@ jzero_far(void *target, size_t bytestozero)
/* Zero out a chunk of memory. */
/* This might be sample-array data, block-array data, or alloc_large data. */
{
MEMZERO(target, bytestozero);
memset(target, 0, bytestozero);
}

@ -0,0 +1,54 @@
/*
* jversion.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2012-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains software version identification.
*/
#if JPEG_LIB_VERSION >= 80
#define JVERSION "8d 15-Jan-2012"
#elif JPEG_LIB_VERSION >= 70
#define JVERSION "7 27-Jun-2009"
#else
#define JVERSION "6b 27-Mar-1998"
#endif
/*
* NOTE: It is our convention to place the authors in the following order:
* - libjpeg-turbo authors (2009-) in descending order of the date of their
* most recent contribution to the project, then in ascending order of the
* date of their first contribution to the project, then in alphabetical
* order
* - Upstream authors in descending order of the date of the first inclusion of
* their code
*/
#define JCOPYRIGHT \
"Copyright (C) 2009-2022 D. R. Commander\n" \
"Copyright (C) 2015, 2020 Google, Inc.\n" \
"Copyright (C) 2019-2020 Arm Limited\n" \
"Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
"Copyright (C) 2011-2016 Siarhei Siamashka\n" \
"Copyright (C) 2015 Intel Corporation\n" \
"Copyright (C) 2013-2014 Linaro Limited\n" \
"Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
"Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
"Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
#define JCOPYRIGHT_SHORT \
"Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others"

@ -0,0 +1,540 @@
macro(simd_fail message)
if(REQUIRE_SIMD)
message(FATAL_ERROR "${message}.")
else()
message(WARNING "${message}. Performance will suffer.")
set(WITH_SIMD 0 PARENT_SCOPE)
endif()
endmacro()
###############################################################################
# x86[-64] (NASM)
###############################################################################
if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
set(CMAKE_ASM_NASM_FLAGS_DEBUG_INIT "-g")
set(CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO_INIT "-g")
# Allow the location of the NASM executable to be specified using the ASM_NASM
# environment variable. This should happen automatically, but unfortunately
# enable_language(ASM_NASM) doesn't parse the ASM_NASM environment variable
# until after CMAKE_ASM_NASM_COMPILER has been populated with the results of
# searching for NASM or Yasm in the PATH.
if(NOT DEFINED CMAKE_ASM_NASM_COMPILER AND DEFINED ENV{ASM_NASM})
set(CMAKE_ASM_NASM_COMPILER $ENV{ASM_NASM})
endif()
if(CPU_TYPE STREQUAL "x86_64")
if(CYGWIN)
set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
endif()
if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
endif()
elseif(CPU_TYPE STREQUAL "i386")
if(BORLAND)
set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
elseif(CYGWIN)
set(CMAKE_ASM_NASM_OBJECT_FORMAT win32)
endif()
endif()
if(NOT REQUIRE_SIMD)
include(CheckLanguage)
check_language(ASM_NASM)
if(NOT CMAKE_ASM_NASM_COMPILER)
simd_fail("SIMD extensions disabled: could not find NASM compiler")
return()
endif()
endif()
enable_language(ASM_NASM)
message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^macho")
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DMACHO")
elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf")
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DELF")
set(CMAKE_ASM_NASM_DEBUG_FORMAT "dwarf2")
endif()
if(CPU_TYPE STREQUAL "x86_64")
if(WIN32 OR CYGWIN)
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN64")
endif()
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D__x86_64__")
elseif(CPU_TYPE STREQUAL "i386")
if(BORLAND)
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DOBJ32")
elseif(WIN32 OR CYGWIN)
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN32")
endif()
endif()
message(STATUS "CMAKE_ASM_NASM_OBJECT_FORMAT = ${CMAKE_ASM_NASM_OBJECT_FORMAT}")
if(NOT CMAKE_ASM_NASM_OBJECT_FORMAT)
simd_fail("SIMD extensions disabled: could not determine NASM object format")
return()
endif()
get_filename_component(CMAKE_ASM_NASM_COMPILER_TYPE
"${CMAKE_ASM_NASM_COMPILER}" NAME_WE)
if(CMAKE_ASM_NASM_COMPILER_TYPE MATCHES "yasm")
foreach(var CMAKE_ASM_NASM_FLAGS_DEBUG CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO)
if(${var} STREQUAL "-g")
if(CMAKE_ASM_NASM_DEBUG_FORMAT)
set_property(CACHE ${var} PROPERTY VALUE "-g ${CMAKE_ASM_NASM_DEBUG_FORMAT}")
else()
set_property(CACHE ${var} PROPERTY VALUE "")
endif()
endif()
endforeach()
endif()
if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC")
endif()
string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
set(EFFECTIVE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} ${CMAKE_ASM_NASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
message(STATUS "CMAKE_ASM_NASM_FLAGS = ${EFFECTIVE_ASM_NASM_FLAGS}")
set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/nasm/\" -I\"${CMAKE_CURRENT_SOURCE_DIR}/${CPU_TYPE}/\"")
set(GREP grep)
if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
set(GREP ggrep)
endif()
add_custom_target(jsimdcfg COMMAND
${CMAKE_C_COMPILER} -E -I${CMAKE_BINARY_DIR} -I${CMAKE_CURRENT_BINARY_DIR}
-I${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc.h |
${GREP} -E '^[\;%]|^\ %' | sed 's%_cpp_protection_%%' |
sed 's@% define@%define@g' >${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc)
if(CPU_TYPE STREQUAL "x86_64")
set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm
x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm
x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm
x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm
x86_64/jquanti-sse2.asm
x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
else()
set(SIMD_SOURCES i386/jsimdcpu.asm i386/jfdctflt-3dn.asm
i386/jidctflt-3dn.asm i386/jquant-3dn.asm
i386/jccolor-mmx.asm i386/jcgray-mmx.asm i386/jcsample-mmx.asm
i386/jdcolor-mmx.asm i386/jdmerge-mmx.asm i386/jdsample-mmx.asm
i386/jfdctfst-mmx.asm i386/jfdctint-mmx.asm i386/jidctfst-mmx.asm
i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm
i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm
i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm
i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm
i386/jquanti-sse2.asm
i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)
endif()
if(MSVC_IDE)
set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
elseif(XCODE)
set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}")
string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
endif()
file(GLOB INC_FILES nasm/*.inc)
foreach(file ${SIMD_SOURCES})
set(OBJECT_DEPENDS "")
if(${file} MATCHES jccolor)
string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${file})
set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
endif()
if(${file} MATCHES jcgray)
string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${file})
set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
endif()
if(${file} MATCHES jdcolor)
string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${file})
set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
endif()
if(${file} MATCHES jdmerge)
string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${file})
set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
endif()
set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES})
if(MSVC_IDE OR XCODE)
# The CMake Visual Studio generators do not work properly with the ASM_NASM
# language, so we have to go rogue here and use a custom command like we
# did in prior versions of libjpeg-turbo. (This is why we can't have nice
# things.)
string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file})
set(SIMD_OBJ ${OBJDIR}/${filename}${CMAKE_C_OUTPUT_EXTENSION})
add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS}
COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT}
${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}
-o${SIMD_OBJ})
set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
else()
set_source_files_properties(${file} PROPERTIES OBJECT_DEPENDS
"${OBJECT_DEPENDS}")
endif()
endforeach()
if(MSVC_IDE OR XCODE)
set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
add_dependencies(simd simd-objs)
else()
add_library(simd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c)
endif()
if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
###############################################################################
# Arm (Intrinsics or GAS)
###############################################################################
elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
# If Neon instructions are not explicitly enabled at compile time (e.g. using
# -mfpu=neon) with an AArch32 Linux or Android build, then the AArch32 SIMD
# dispatcher will parse /proc/cpuinfo to determine whether the Neon SIMD
# extensions can be enabled at run time. In order to support all AArch32 CPUs
# using the same code base, i.e. to support run-time FPU and Neon
# auto-detection, it is necessary to compile the scalar C source code using
# -mfloat-abi=soft (which is usually the default) but compile the intrinsics
# implementation of the Neon SIMD extensions using -mfloat-abi=softfp. The
# following test determines whether -mfloat-abi=softfp should be explicitly
# added to the compile flags for the intrinsics implementation of the Neon SIMD
# extensions.
if(BITS EQUAL 32)
check_c_source_compiles("
#if defined(__ARM_NEON__) || (!defined(__linux__) && !defined(ANDROID) && !defined(__ANDROID__))
#error \"Neon run-time auto-detection will not be used\"
#endif
#if __ARM_PCS_VFP == 1
#error \"float ABI = hard\"
#endif
#if __SOFTFP__ != 1
#error \"float ABI = softfp\"
#endif
int main(void) { return 0; }" NEED_SOFTFP_FOR_INTRINSICS)
if(NEED_SOFTFP_FOR_INTRINSICS)
set(SOFTFP_FLAG -mfloat-abi=softfp)
endif()
endif()
if(BITS EQUAL 32)
set(CMAKE_REQUIRED_FLAGS "-mfpu=neon ${SOFTFP_FLAG}")
check_c_source_compiles("
#include <arm_neon.h>
int main(int argc, char **argv) {
uint16x8_t input = vdupq_n_u16((uint16_t)argc);
uint8x8_t output = vmovn_u16(input);
return (int)output[0];
}" HAVE_NEON)
if(NOT HAVE_NEON)
simd_fail("SIMD extensions not available for this architecture")
return()
endif()
endif()
check_c_source_compiles("
#include <arm_neon.h>
int main(int argc, char **argv) {
int16_t input[] = {
(int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc,
(int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc,
(int16_t)argc, (int16_t)argc, (int16_t)argc, (int16_t)argc
};
int16x4x3_t output = vld1_s16_x3(input);
vst3_s16(input, output);
return (int)input[0];
}" HAVE_VLD1_S16_X3)
check_c_source_compiles("
#include <arm_neon.h>
int main(int argc, char **argv) {
uint16_t input[] = {
(uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc,
(uint16_t)argc, (uint16_t)argc, (uint16_t)argc, (uint16_t)argc
};
uint16x4x2_t output = vld1_u16_x2(input);
vst2_u16(input, output);
return (int)input[0];
}" HAVE_VLD1_U16_X2)
check_c_source_compiles("
#include <arm_neon.h>
int main(int argc, char **argv) {
uint8_t input[] = {
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc,
(uint8_t)argc, (uint8_t)argc, (uint8_t)argc, (uint8_t)argc
};
uint8x16x4_t output = vld1q_u8_x4(input);
vst4q_u8(input, output);
return (int)input[0];
}" HAVE_VLD1Q_U8_X4)
if(BITS EQUAL 32)
unset(CMAKE_REQUIRED_FLAGS)
endif()
configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
# GCC 11 and earlier and some older versions of Clang do not have a full or
# optimal set of Neon intrinsics, so for performance reasons, when using those
# compilers, we default to using the older GAS implementation of the Neon SIMD
# extensions for certain algorithms. The presence or absence of the three
# intrinsics we tested above is a reasonable proxy for this, except with GCC 10
# and 11.
if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4 AND
(NOT CMAKE_COMPILER_IS_GNUCC OR
CMAKE_C_COMPILER_VERSION VERSION_EQUAL 12.0.0 OR
CMAKE_C_COMPILER_VERSION VERSION_GREATER 12.0.0)))
set(DEFAULT_NEON_INTRINSICS 1)
else()
set(DEFAULT_NEON_INTRINSICS 0)
endif()
option(NEON_INTRINSICS
"Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms. Setting this option forces the full Neon intrinsics implementation to be used with all compilers. Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
${DEFAULT_NEON_INTRINSICS})
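# Example (illustrative only): the default computed above can be overridden at
# configure time, e.g. `cmake -DNEON_INTRINSICS=1 ..` to force the full
# intrinsics implementation or `-DNEON_INTRINSICS=0` to force the hybrid
# GAS/intrinsics implementation.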
if(NOT NEON_INTRINSICS)
enable_language(ASM)
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
# Test whether gas-preprocessor.pl would be needed to build the GAS
# implementation of the Neon SIMD extensions. If so, then automatically
# enable the full Neon intrinsics implementation.
if(CPU_TYPE STREQUAL "arm")
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm
pld [r0]
vmovn.u16 d0, q0")
else()
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
.text
MYVAR .req x0
movi v0.16b, #100
mov MYVAR, #100
.unreq MYVAR")
endif()
separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}")
execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP}
-x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
if(NOT RESULT EQUAL 0)
message(WARNING "GAS appears to be broken. Using the full Neon SIMD intrinsics implementation.")
set(NEON_INTRINSICS 1 CACHE INTERNAL "" FORCE)
endif()
endif()
boolean_number(NEON_INTRINSICS PARENT_SCOPE)
if(NEON_INTRINSICS)
add_definitions(-DNEON_INTRINSICS)
message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
else()
message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
endif()
set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
arm/jidctred-neon.c arm/jquanti-neon.c)
if(NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
endif()
if(NEON_INTRINSICS OR BITS EQUAL 64)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c)
endif()
if(NEON_INTRINSICS OR BITS EQUAL 32)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
arm/jdcolor-neon.c arm/jfdctint-neon.c)
endif()
if(BITS EQUAL 32)
set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS "-mfpu=neon ${SOFTFP_FLAG}")
endif()
if(NOT NEON_INTRINSICS)
string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
endif()
add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
###############################################################################
# MIPS (GAS)
###############################################################################
elseif(CPU_TYPE STREQUAL "mips" OR CPU_TYPE STREQUAL "mipsel")
enable_language(ASM)
string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
set(CMAKE_REQUIRED_FLAGS -mdspr2)
check_c_source_compiles("
#if !(defined(__mips__) && __mips_isa_rev >= 2)
#error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
#endif
int main(void) {
int c = 0, a = 0, b = 0;
__asm__ __volatile__ (
\"precr.qb.ph %[c], %[a], %[b]\"
: [c] \"=r\" (c)
: [a] \"r\" (a), [b] \"r\" (b)
);
return c;
}" HAVE_DSPR2)
unset(CMAKE_REQUIRED_FLAGS)
if(NOT HAVE_DSPR2)
simd_fail("SIMD extensions not available for this CPU")
return()
endif()
add_library(simd OBJECT mips/jsimd_dspr2.S mips/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
###############################################################################
# MIPS64 (Intrinsics)
###############################################################################
elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "^mips64")
set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
check_c_source_compiles("
int main(void) {
int c = 0, a = 0, b = 0;
asm (
\"paddb %0, %1, %2\"
: \"=f\" (c)
: \"f\" (a), \"f\" (b)
);
return c;
}" HAVE_MMI)
unset(CMAKE_REQUIRED_FLAGS)
if(NOT HAVE_MMI)
simd_fail("SIMD extensions not available for this CPU")
return()
endif()
set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
if(CMAKE_COMPILER_IS_GNUCC)
foreach(file ${SIMD_SOURCES})
set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
" -fno-strict-aliasing")
endforeach()
endif()
foreach(file ${SIMD_SOURCES})
set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wa,-mloongson-mmi,-mloongson-ext")
endforeach()
add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
###############################################################################
# PowerPC (Intrinsics)
###############################################################################
elseif(CPU_TYPE STREQUAL "powerpc")
set(CMAKE_REQUIRED_FLAGS -maltivec)
check_c_source_compiles("
#include <altivec.h>
int main(void) {
__vector int vi = { 0, 0, 0, 0 };
int i[4];
vec_st(vi, 0, i);
return i[0];
}" HAVE_ALTIVEC)
unset(CMAKE_REQUIRED_FLAGS)
if(NOT HAVE_ALTIVEC)
simd_fail("SIMD extensions not available for this CPU (PowerPC SPE)")
return()
endif()
set(SIMD_SOURCES powerpc/jccolor-altivec.c powerpc/jcgray-altivec.c
powerpc/jcsample-altivec.c powerpc/jdcolor-altivec.c
powerpc/jdmerge-altivec.c powerpc/jdsample-altivec.c
powerpc/jfdctfst-altivec.c powerpc/jfdctint-altivec.c
powerpc/jidctfst-altivec.c powerpc/jidctint-altivec.c
powerpc/jquanti-altivec.c)
set_source_files_properties(${SIMD_SOURCES} PROPERTIES
COMPILE_FLAGS -maltivec)
add_library(simd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
###############################################################################
# None
###############################################################################
else()
simd_fail("SIMD extensions not available for this CPU (${CMAKE_SYSTEM_PROCESSOR})")
endif() # CPU_TYPE

@@ -0,0 +1,148 @@
/*
* jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
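/* For reference, a scalar sketch of the fixed-point arithmetic performed by
 * the vector code below (Q16 constants as listed above; 32767 is the
 * fixed-point equivalent of 0.5 baked into scaled_128_5):
 *   Y  = (19595 * R + 38470 * G +  7471 * B + 32768) >> 16
 *   Cb = ((128 << 16) + 32767 - 11059 * R - 21709 * G + 32768 * B) >> 16
 *   Cr = ((128 << 16) + 32767 + 32768 * R - 27439 * G -  5329 * B) >> 16
 * The Y sum uses a rounding narrowing shift (vrshrn), hence its extra 32768.
 */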
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
/* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 8) pixels in row. */
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
/* Set up conversion constants. */
#ifdef HAVE_VLD1_U16_X2
const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x2(). */
const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
const uint16x4x2_t consts = { { consts1, consts2 } };
#endif
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 8) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 8) {
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
#if RGB_PIXELSIZE == 4
uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_low = scaled_128_5;
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
uint32x4_t cb_high = scaled_128_5;
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_low = scaled_128_5;
cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
uint32x4_t cr_high = scaled_128_5;
cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
vrshrn_n_u32(y_high, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
vshrn_n_u32(cb_high, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
vshrn_n_u32(cr_high, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
/* Increment pointers. */
inptr += (8 * RGB_PIXELSIZE);
outptr0 += 8;
outptr1 += 8;
outptr2 += 8;
}
}
}

@@ -0,0 +1,334 @@
/*
* jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
uint8_t block_nbits[DCTSIZE2];
uint16_t block_diff[DCTSIZE2];
/* Load rows of coefficients from DCT block in zig-zag order. */
/* Compute DC coefficient difference value. (F.1.1.5.1) */
int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
row0 = vld1q_lane_s16(block + 1, row0, 1);
row0 = vld1q_lane_s16(block + 8, row0, 2);
row0 = vld1q_lane_s16(block + 16, row0, 3);
row0 = vld1q_lane_s16(block + 9, row0, 4);
row0 = vld1q_lane_s16(block + 2, row0, 5);
row0 = vld1q_lane_s16(block + 3, row0, 6);
row0 = vld1q_lane_s16(block + 10, row0, 7);
int16x8_t row1 = vld1q_dup_s16(block + 17);
row1 = vld1q_lane_s16(block + 24, row1, 1);
row1 = vld1q_lane_s16(block + 32, row1, 2);
row1 = vld1q_lane_s16(block + 25, row1, 3);
row1 = vld1q_lane_s16(block + 18, row1, 4);
row1 = vld1q_lane_s16(block + 11, row1, 5);
row1 = vld1q_lane_s16(block + 4, row1, 6);
row1 = vld1q_lane_s16(block + 5, row1, 7);
int16x8_t row2 = vld1q_dup_s16(block + 12);
row2 = vld1q_lane_s16(block + 19, row2, 1);
row2 = vld1q_lane_s16(block + 26, row2, 2);
row2 = vld1q_lane_s16(block + 33, row2, 3);
row2 = vld1q_lane_s16(block + 40, row2, 4);
row2 = vld1q_lane_s16(block + 48, row2, 5);
row2 = vld1q_lane_s16(block + 41, row2, 6);
row2 = vld1q_lane_s16(block + 34, row2, 7);
int16x8_t row3 = vld1q_dup_s16(block + 27);
row3 = vld1q_lane_s16(block + 20, row3, 1);
row3 = vld1q_lane_s16(block + 13, row3, 2);
row3 = vld1q_lane_s16(block + 6, row3, 3);
row3 = vld1q_lane_s16(block + 7, row3, 4);
row3 = vld1q_lane_s16(block + 14, row3, 5);
row3 = vld1q_lane_s16(block + 21, row3, 6);
row3 = vld1q_lane_s16(block + 28, row3, 7);
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t row0_lz = vclzq_s16(abs_row0);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
/* Compute number of bits required to represent each coefficient. */
uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
uint16x8_t row0_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
vnegq_s16(row0_lz));
uint16x8_t row1_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
vnegq_s16(row1_lz));
uint16x8_t row2_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
vnegq_s16(row2_lz));
uint16x8_t row3_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
vnegq_s16(row3_lz));
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
/* Store diff values for rows 0, 1, 2, and 3. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
/* Load last four rows of coefficients from DCT block in zig-zag order. */
int16x8_t row4 = vld1q_dup_s16(block + 35);
row4 = vld1q_lane_s16(block + 42, row4, 1);
row4 = vld1q_lane_s16(block + 49, row4, 2);
row4 = vld1q_lane_s16(block + 56, row4, 3);
row4 = vld1q_lane_s16(block + 57, row4, 4);
row4 = vld1q_lane_s16(block + 50, row4, 5);
row4 = vld1q_lane_s16(block + 43, row4, 6);
row4 = vld1q_lane_s16(block + 36, row4, 7);
int16x8_t row5 = vld1q_dup_s16(block + 29);
row5 = vld1q_lane_s16(block + 22, row5, 1);
row5 = vld1q_lane_s16(block + 15, row5, 2);
row5 = vld1q_lane_s16(block + 23, row5, 3);
row5 = vld1q_lane_s16(block + 30, row5, 4);
row5 = vld1q_lane_s16(block + 37, row5, 5);
row5 = vld1q_lane_s16(block + 44, row5, 6);
row5 = vld1q_lane_s16(block + 51, row5, 7);
int16x8_t row6 = vld1q_dup_s16(block + 58);
row6 = vld1q_lane_s16(block + 59, row6, 1);
row6 = vld1q_lane_s16(block + 52, row6, 2);
row6 = vld1q_lane_s16(block + 45, row6, 3);
row6 = vld1q_lane_s16(block + 38, row6, 4);
row6 = vld1q_lane_s16(block + 31, row6, 5);
row6 = vld1q_lane_s16(block + 39, row6, 6);
row6 = vld1q_lane_s16(block + 46, row6, 7);
int16x8_t row7 = vld1q_dup_s16(block + 53);
row7 = vld1q_lane_s16(block + 60, row7, 1);
row7 = vld1q_lane_s16(block + 61, row7, 2);
row7 = vld1q_lane_s16(block + 54, row7, 3);
row7 = vld1q_lane_s16(block + 47, row7, 4);
row7 = vld1q_lane_s16(block + 55, row7, 5);
row7 = vld1q_lane_s16(block + 62, row7, 6);
row7 = vld1q_lane_s16(block + 63, row7, 7);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Compute number of bits required to represent each coefficient. */
uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
uint16x8_t row4_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
vnegq_s16(row4_lz));
uint16x8_t row5_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
vnegq_s16(row5_lz));
uint16x8_t row6_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
vnegq_s16(row6_lz));
uint16x8_t row7_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
vnegq_s16(row7_lz));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
/* Store diff values for rows 4, 5, 6, and 7. */
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
/* Shift left to remove DC bit. */
bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
/* Move bitmap to 32-bit scalar registers. */
uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
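/* Illustrative note: after the left shift above, bit 31 of bitmap_1_32
 * corresponds to AC coefficient 1 (zig-zag order), bit 30 to coefficient 2,
 * ..., bit 0 to coefficient 32; bitmap_33_63 holds coefficients 33-63 in its
 * upper 31 bits.  BUILTIN_CLZ() on these words therefore yields the run
 * length of zero coefficients directly.  For example, if only coefficients 3
 * and 6 are nonzero, bitmap_1_32 == 0x24000000: the first CLZ returns 2 (two
 * zeros precede coefficient 3) and, after the re-shifts below, the second CLZ
 * returns 2 again (coefficients 4 and 5 are zero).
 */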
/* Set up state and bit buffer for output bitstream. */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
unsigned int nbits = block_nbits[0];
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = block_diff[0];
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
while (bitmap_1_32 != 0) {
r = BUILTIN_CLZ(bitmap_1_32);
i += r;
bitmap_1_32 <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap_1_32 <<= 1;
}
r = 33 - i;
i = 33;
while (bitmap_33_63 != 0) {
unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
r += leading_zeros;
i += leading_zeros;
bitmap_33_63 <<= leading_zeros;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
r = 0;
i++;
bitmap_33_63 <<= 1;
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
* The value of RS for the EOB code is 0.
*/
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}

@@ -0,0 +1,980 @@
/*
* jsimd_arm.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2019, Google LLC.
* Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
* For conditions of distribution and use, see copyright notice in jsimdext.inc
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
* 32-bit Arm architecture.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
LOCAL(int)
check_feature(char *buffer, char *feature)
{
char *p;
if (*feature == 0)
return 0;
if (strncmp(buffer, "Features", 8) != 0)
return 0;
buffer += 8;
while (isspace(*buffer))
buffer++;
/* Check if 'feature' is present in the buffer as a separate word */
while ((p = strstr(buffer, feature))) {
if (p > buffer && !isspace(*(p - 1))) {
buffer++;
continue;
}
p += strlen(feature);
if (*p != 0 && !isspace(*p)) {
buffer++;
continue;
}
return 1;
}
return 0;
}
LOCAL(int)
parse_proc_cpuinfo(int bufsize)
{
char *buffer = (char *)malloc(bufsize);
FILE *fd;
simd_support = 0;
if (!buffer)
return 0;
fd = fopen("/proc/cpuinfo", "r");
if (fd) {
while (fgets(buffer, bufsize, fd)) {
if (!strchr(buffer, '\n') && !feof(fd)) {
/* "impossible" happened - insufficient size of the buffer! */
fclose(fd);
free(buffer);
return 0;
}
if (check_feature(buffer, "neon"))
simd_support |= JSIMD_NEON;
}
fclose(fd);
}
free(buffer);
return 1;
}
#endif
/*
* Check what SIMD accelerations are supported.
*
* FIXME: This code is racy under a multi-threaded environment.
*/
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
char env[2] = { 0 };
#endif
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
int bufsize = 1024; /* an initial guess for the line buffer size limit */
#endif
if (simd_support != ~0U)
return;
simd_support = 0;
#if defined(__ARM_NEON__)
simd_support |= JSIMD_NEON;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
/* We still have a chance to use Neon regardless of globally used
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
* /proc/cpuinfo parsing on linux/android */
while (!parse_proc_cpuinfo(bufsize)) {
bufsize *= 2;
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
break;
}
#endif
#ifndef NO_GETENV
/* Force different settings through environment variables */
if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
simd_support = JSIMD_NEON;
if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
#endif
}
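/* Usage note (illustrative): these environment variables provide run-time
 * overrides.  For example, launching an application with JSIMD_FORCENONE=1
 * in the environment disables all SIMD acceleration, while JSIMD_NOHUFFENC=1
 * disables only the SIMD Huffman encoder.
 */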
GLOBAL(int)
jsimd_can_rgb_ycc(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_rgb_gray(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_ycc_rgb565(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_extrgb_ycc_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_ycc_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_extbgr_ycc_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_extbgrx_ycc_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_extxbgr_ycc_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_extxrgb_ycc_convert_neon;
break;
default:
neonfct = jsimd_extrgb_ycc_convert_neon;
break;
}
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_extrgb_gray_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_gray_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_extbgr_gray_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_extbgrx_gray_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_extxbgr_gray_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_extxrgb_gray_convert_neon;
break;
default:
neonfct = jsimd_extrgb_gray_convert_neon;
break;
}
neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_ycc_extrgb_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_ycc_extrgbx_convert_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_ycc_extbgr_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_ycc_extbgrx_convert_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_ycc_extxbgr_convert_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_ycc_extxrgb_convert_neon;
break;
default:
neonfct = jsimd_ycc_extrgb_convert_neon;
break;
}
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
GLOBAL(void)
jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
output_buf, num_rows);
}
GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (DCTSIZE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_downsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (DCTSIZE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor, compptr->width_in_blocks,
input_data, output_data);
}
GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor, compptr->width_in_blocks,
input_data, output_data);
}
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h1v2_fancy_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(void)
jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
init_simd();
/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
}
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
}
neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(int)
jsimd_can_convsamp(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_convsamp_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
jsimd_convsamp_neon(sample_data, start_col, workspace);
}
GLOBAL(void)
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
FAST_FLOAT *workspace)
{
}
GLOBAL(int)
jsimd_can_fdct_islow(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_fdct_ifast(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_fdct_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
jsimd_fdct_islow_neon(data);
}
GLOBAL(void)
jsimd_fdct_ifast(DCTELEM *data)
{
jsimd_fdct_ifast_neon(data);
}
GLOBAL(void)
jsimd_fdct_float(FAST_FLOAT *data)
{
}
GLOBAL(int)
jsimd_can_quantize(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (sizeof(DCTELEM) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_quantize_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
jsimd_quantize_neon(coef_block, divisors, workspace);
}
GLOBAL(void)
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
FAST_FLOAT *workspace)
{
}
GLOBAL(int)
jsimd_can_idct_2x2(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_4x4(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
}
GLOBAL(void)
jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
}
GLOBAL(int)
jsimd_can_idct_islow(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(ISLOW_MULT_TYPE) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_ifast(void)
{
init_simd();
/* The code is optimised for these values only */
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
if (sizeof(IFAST_MULT_TYPE) != 2)
return 0;
if (IFAST_SCALE_BITS != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_can_idct_float(void)
{
return 0;
}
GLOBAL(void)
jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
output_col);
}
GLOBAL(void)
jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
output_col);
}
GLOBAL(void)
jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
}
GLOBAL(int)
jsimd_can_huff_encode_one_block(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON && simd_huffman)
return 1;
return 0;
}
GLOBAL(JOCTET *)
jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (simd_support & JSIMD_NEON)
return 1;
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
jpeg_natural_order_start, Sl,
Al, absvalues, bits);
}

File diff suppressed because it is too large

@@ -0,0 +1,316 @@
/*
* jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
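/* Note: unlike the 32-bit implementation, the main loop below processes 16
 * pixels per iteration.  A 9-15 pixel tail is handled by one further
 * 16-pixel vector pass through the temporary buffer, and a 1-8 pixel tail by
 * an 8-pixel pass.
 */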
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
/* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
/* Set up conversion constants. */
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
/* Increment pointers. */
inptr += (16 * RGB_PIXELSIZE);
outptr0 += 16;
outptr1 += 16;
outptr2 += 16;
}
if (cols_remaining > 8) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
} else if (cols_remaining > 0) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_l = scaled_128_5;
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
uint32x4_t cb_h = scaled_128_5;
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_l = scaled_128_5;
cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
uint32x4_t cr_h = scaled_128_5;
cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
vrshrn_n_u32(y_h, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
vshrn_n_u32(cb_h, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
vshrn_n_u32(cr_h, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
}
}
}

@@ -0,0 +1,411 @@
/*
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../align.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
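/* Table of byte indices used with vqtbl4q_s8()/vqtbl3q_s8() below to gather
 * the 8x8 block of DCT coefficients into zig-zag order.  Entries of 255 mark
 * lanes that cannot be reached through a single table lookup; those lanes are
 * filled in afterwards with vsetq_lane_s16().
 */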
ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
0, 1, 2, 3, 16, 17, 32, 33,
18, 19, 4, 5, 6, 7, 20, 21,
34, 35, 48, 49, 255, 255, 50, 51,
36, 37, 22, 23, 8, 9, 10, 11,
255, 255, 6, 7, 20, 21, 34, 35,
48, 49, 255, 255, 50, 51, 36, 37,
54, 55, 40, 41, 26, 27, 12, 13,
14, 15, 28, 29, 42, 43, 56, 57,
6, 7, 20, 21, 34, 35, 48, 49,
50, 51, 36, 37, 22, 23, 8, 9,
26, 27, 12, 13, 255, 255, 14, 15,
28, 29, 42, 43, 56, 57, 255, 255,
52, 53, 54, 55, 40, 41, 26, 27,
12, 13, 255, 255, 14, 15, 28, 29,
26, 27, 40, 41, 42, 43, 28, 29,
14, 15, 30, 31, 44, 45, 46, 47
};
/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
* address warning because the macro sometimes writes a 64-bit value to a
* non-64-bit-aligned address. That behavior is technically undefined per
* the C specification, but it is supported by the AArch64 architecture and
* compilers.
*/
#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("alignment")))
#endif
#endif
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
uint16_t block_diff[DCTSIZE2];
/* Load lookup table indices for rows of zig-zag ordering. */
#ifdef HAVE_VLD1Q_U8_X4
const uint8x16x4_t idx_rows_0123 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
const uint8x16x4_t idx_rows_4567 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
#else
/* GCC does not currently support the vld1q_<type>_x4() intrinsics. */
const uint8x16x4_t idx_rows_0123 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
} };
const uint8x16x4_t idx_rows_4567 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
} };
#endif
/* Load 8x8 block of DCT coefficients. */
#ifdef HAVE_VLD1Q_U8_X4
const int8x16x4_t tbl_rows_0123 =
vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
const int8x16x4_t tbl_rows_4567 =
vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
#else
const int8x16x4_t tbl_rows_0123 = { {
vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
} };
const int8x16x4_t tbl_rows_4567 = { {
vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
} };
#endif
/* Initialise extra lookup tables. */
const int8x16x4_t tbl_rows_2345 = { {
tbl_rows_0123.val[2], tbl_rows_0123.val[3],
tbl_rows_4567.val[0], tbl_rows_4567.val[1]
} };
const int8x16x3_t tbl_rows_567 =
{ { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
/* Shuffle coefficients into zig-zag order. */
int16x8_t row0 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
int16x8_t row1 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
int16x8_t row2 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
int16x8_t row3 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
int16x8_t row4 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
int16x8_t row5 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
int16x8_t row6 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
int16x8_t row7 =
vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
/* Compute DC coefficient difference value (F.1.1.5.1). */
row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
/* Initialize AC coefficient lanes not reachable by lookup tables. */
row1 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
0), row1, 2);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
4), row2, 0);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
0), row2, 5);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
7), row5, 2);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
3), row5, 7);
row6 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
7), row6, 5);
/* DCT block is now in zig-zag order; start Huffman encoding process. */
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
vreinterpretq_u8_u16(row0_ne_0));
uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
vreinterpretq_u8_u16(row2_ne_0));
uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
vreinterpretq_u8_u16(row4_ne_0));
uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
vreinterpretq_u8_u16(row6_ne_0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x16_t bitmap_mask =
vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
bitmap_rows_3210);
uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
vget_high_u8(bitmap_rows_76543210));
/* Shift left to remove DC bit. */
bitmap_all =
vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
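/* After the shift, the most significant bit of the bitmap corresponds to the
 * first AC coefficient in zig-zag order, which matches the CLZ-based scan in
 * the encoding loops below.
 */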
/* Count bits set (number of non-zero coefficients) in bitmap. */
unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
/* Move bitmap to 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Set up state and bit buffer for output bitstream. */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
/* For negative coeffs: the diff bits are coeff - 1, which in two's complement
 * equals ~abs(coeff).
 */
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t row0_lz = vclzq_s16(abs_row0);
uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
/* Find nbits required to specify sign and amplitude of coefficient. */
unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
unsigned int nbits = 16 - lz;
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = vgetq_lane_u16(row0_diff, 0);
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
/* The most efficient method of computing nbits and diff depends on the
* number of non-zero coefficients. If the bitmap is not too sparse (> 8
* non-zero AC coefficients), it is beneficial to do all of the work using
* Neon; else we do some of the work using Neon and the rest on demand using
* scalar code.
*/
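/* In both paths, for each non-zero coefficient:
 *   nbits = 16 - CLZ(abs(coef))                       (number of magnitude bits)
 *   diff  = abs(coef) ^ ((coef < 0) ? ((1 << nbits) - 1) : 0)
 * i.e. negative coefficients have their magnitude bits inverted.
 */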
if (non_zero_coefficients > 8) {
uint8_t block_nbits[DCTSIZE2];
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Narrow leading zero count to 8 bits. */
uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
vreinterpretq_u8_s16(row1_lz));
uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
vreinterpretq_u8_s16(row3_lz));
uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
vreinterpretq_u8_s16(row5_lz));
uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
vreinterpretq_u8_s16(row7_lz));
/* Compute nbits needed to specify magnitude of each coefficient. */
uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
/* Store nbits. */
vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
/* Mask bits not required to specify sign and amplitude of diff. */
uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
/* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
row3_mask);
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
row7_mask);
/* Store diff bits. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap <<= 1;
}
} else if (bitmap != 0) {
uint16_t block_abs[DCTSIZE2];
/* Compute and store absolute value of coefficients. */
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
/* Compute diff bits (without nbits mask) and store. */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
vcltzq_s16(row1));
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
vcltzq_s16(row2));
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
vcltzq_s16(row3));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
vcltzq_s16(row4));
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
vcltzq_s16(row5));
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
vcltzq_s16(row6));
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
vcltzq_s16(row7));
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Same as above but must mask diff bits and compute nbits on demand. */
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
lz = BUILTIN_CLZ(block_abs[i]);
nbits = 32 - lz;
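/* Shift left then right by lz to clear the bits above nbits; this performs
 * the masking that the dense path applied with row<n>_mask.
 */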
diff = ((unsigned int)block_diff[i] << lz) >> lz;
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
bitmap <<= 1;
}
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
* The value of RS for the EOB code is 0.
*/
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,28 @@
/*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* How to obtain memory alignment for structures and variables */
#if defined(_MSC_VER)
#define ALIGN(alignment) __declspec(align(alignment))
#elif defined(__clang__) || defined(__GNUC__)
#define ALIGN(alignment) __attribute__((aligned(alignment)))
#else
#error "Unknown compiler"
#endif

@@ -0,0 +1,160 @@
/*
* jccolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* RGB -> YCbCr conversion constants */
#define F_0_298 19595
#define F_0_587 38470
#define F_0_113 7471
#define F_0_168 11059
#define F_0_331 21709
#define F_0_500 32768
#define F_0_418 27439
#define F_0_081 5329
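/* The constants above are the RGB -> YCbCr coefficients scaled by 2^16 and
 * rounded to the nearest integer (e.g. 0.29900 * 65536 = 19595.26 -> 19595).
 * Note that 19595 + 38470 + 7471 = 65536, so the luma weights sum to exactly
 * 1.0, and 11059 + 21709 = 27439 + 5329 = 32768 = F_0_500, so the Cb and Cr
 * weights each sum to zero and neutral colors map to Cb = Cr = 128.
 */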
ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
F_0_298, F_0_587, F_0_113, F_0_168,
F_0_331, F_0_500, F_0_418, F_0_081
};
/* Include inline routines for colorspace extensions. */
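/* jccolext-neon.c is compiled here once for the default RGB pixel format and
 * then re-included once per extended colorspace.  RGB_RED, RGB_GREEN,
 * RGB_BLUE, RGB_PIXELSIZE, and the function name are redefined before each
 * subsequent inclusion so that a separate conversion routine is emitted for
 * each pixel format.
 */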
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_ycc_convert_neon

@@ -0,0 +1,120 @@
/*
* jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* RGB -> Grayscale conversion constants */
#define F_0_298 19595
#define F_0_587 38470
#define F_0_113 7471
/* Include inline routines for colorspace extensions. */
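/* As in jccolor-neon.c, jcgryext-neon.c is compiled once for the default RGB
 * pixel format and then re-included once per extended colorspace, with the
 * RGB_* macros and the function name redefined before each subsequent
 * inclusion.
 */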
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
#include "jcgryext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_rgb_gray_convert_neon

@@ -0,0 +1,106 @@
/*
* jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jcgray-neon.c */
/* RGB -> Grayscale conversion is defined by the following equation:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
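* (These three constants sum to exactly 65536 = 2^16, so a neutral gray with
* R = G = B maps back to the same 8-bit value after the final rounding right
* shift by 16.)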
* These constants are defined in jcgray-neon.c
*
* This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
*/
void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
JSAMPROW inptr;
JSAMPROW outptr;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr = output_buf[0][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 16) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 16) {
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
* permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
/* Increment pointers. */
inptr += (16 * RGB_PIXELSIZE);
outptr += 16;
}
}
}

@@ -0,0 +1,131 @@
/*
* jchuff.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2009, 2018, 2021, D. R. Commander.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020-2021, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
/* Expanded entropy encoder object for Huffman encoding.
*
* The savable_state subrecord contains fields that change within an MCU,
* but must not be updated permanently until we complete the MCU.
*/
#if defined(__aarch64__) || defined(_M_ARM64)
#define BIT_BUF_SIZE 64
#else
#define BIT_BUF_SIZE 32
#endif
typedef struct {
size_t put_buffer; /* current bit accumulation buffer */
int free_bits; /* # of bits available in it */
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
typedef struct {
JOCTET *next_output_byte; /* => next byte to write in buffer */
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
savable_state cur; /* Current bit buffer & DC state */
j_compress_ptr cinfo; /* dump_buffer needs access to this */
int simd;
} working_state;
/* Outputting bits to the file */
/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
* as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
* 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
* speculative 0 byte will be overwritten by the next byte.
*/
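/* Note: the expression  buffer -= -2 + ((JOCTET)(b) < 0xFF);  advances the
 * pointer by 1 for ordinary bytes and by 2 when b == 0xFF, without a branch.
 */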
#define EMIT_BYTE(b) { \
buffer[0] = (JOCTET)(b); \
buffer[1] = 0; \
buffer -= -2 + ((JOCTET)(b) < 0xFF); \
}
/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
* directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
* encode 0xFF as 0xFF 0x00.
*/
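/* The test below is non-zero only if the bit buffer contains a 0xFF byte:
 * per byte, (b & 0x80) & ~(b + 1) has bit 7 set only when b == 0xFF.  (Carries
 * between bytes can add false positives only when a lower byte is already
 * 0xFF, in which case the slow byte-stuffing path is needed anyway.)
 */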
#if defined(__aarch64__) || defined(_M_ARM64)
#define FLUSH() { \
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
EMIT_BYTE(put_buffer >> 56) \
EMIT_BYTE(put_buffer >> 48) \
EMIT_BYTE(put_buffer >> 40) \
EMIT_BYTE(put_buffer >> 32) \
EMIT_BYTE(put_buffer >> 24) \
EMIT_BYTE(put_buffer >> 16) \
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
*((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
buffer += 8; \
} \
}
#else
#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
buffer[0] = (JOCTET)(put_buffer >> 24); \
buffer[1] = (JOCTET)(put_buffer >> 16); \
buffer[2] = (JOCTET)(put_buffer >> 8); \
buffer[3] = (JOCTET)(put_buffer ); \
buffer += 4; \
}
#else
#define SPLAT() { \
put_buffer = __builtin_bswap32(put_buffer); \
__asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
}
#endif
#define FLUSH() { \
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
EMIT_BYTE(put_buffer >> 24) \
EMIT_BYTE(put_buffer >> 16) \
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
SPLAT(); \
} \
}
#endif
/* Fill the bit buffer to capacity with the leading bits from code, then output
* the bit buffer and put the remaining bits from code into the bit buffer.
*/
#define PUT_AND_FLUSH(code, size) { \
put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
FLUSH() \
free_bits += BIT_BUF_SIZE; \
put_buffer = code; \
}
/* Insert code into the bit buffer and output the bit buffer if needed.
* NOTE: We can't flush with free_bits == 0, since the left shift in
* PUT_AND_FLUSH() would have undefined behavior.
*/
#define PUT_BITS(code, size) { \
free_bits -= size; \
if (free_bits < 0) \
PUT_AND_FLUSH(code, size) \
else \
put_buffer = (put_buffer << size) | code; \
}
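/* Concatenate the Huffman code with the additional diff bits: diff already
 * holds the amplitude bits in its low nbits bits, the code is placed directly
 * above them, and the combined nbits + size bits are emitted with PUT_BITS().
 */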
#define PUT_CODE(code, size, diff) { \
diff |= code << nbits; \
nbits += size; \
PUT_BITS(diff, nbits) \
}

@@ -0,0 +1,622 @@
/*
* jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* Data preparation for encode_mcu_AC_first().
*
* The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
* found in jcphuff.c.
*/
void jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits)
{
JCOEF *values_ptr = values;
JCOEF *diff_values_ptr = values + DCTSIZE2;
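/* values[0..DCTSIZE2-1] receive the point-transformed coefficient magnitudes;
 * values[DCTSIZE2..2*DCTSIZE2-1] receive the corresponding diff values
 * (magnitude XORed with the coefficient's sign mask).
 */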
/* Rows of coefficients to zero (since they haven't been processed) */
int i, rows_to_zero = 8;
for (i = 0; i < Sl / 16; i++) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
/* Isolate sign of coefficients. */
int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs1 = vabsq_s16(coefs1);
int16x8_t abs_coefs2 = vabsq_s16(coefs2);
coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
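/* (vshlq_s16() with a negative shift count shifts right, so this implements
 * the point transform abs(coef) >> Al.)
 */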
/* Compute diff values. */
int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
vst1q_s16(values_ptr, coefs1);
vst1q_s16(values_ptr + DCTSIZE, coefs2);
vst1q_s16(diff_values_ptr, diff1);
vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
}
/* Same operation but for remaining partial vector */
int remaining_coefs = Sl % 16;
if (remaining_coefs > 8) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vdupq_n_s16(0);
switch (remaining_coefs) {
case 15:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Isolate sign of coefficients. */
int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs1 = vabsq_s16(coefs1);
int16x8_t abs_coefs2 = vabsq_s16(coefs2);
coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
/* Compute diff values. */
int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
vst1q_s16(values_ptr, coefs1);
vst1q_s16(values_ptr + DCTSIZE, coefs2);
vst1q_s16(diff_values_ptr, diff1);
vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
rows_to_zero -= 2;
} else if (remaining_coefs > 0) {
int16x8_t coefs = vdupq_n_s16(0);
switch (remaining_coefs) {
case 8:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
FALLTHROUGH /*FALLTHROUGH*/
case 7:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Isolate sign of coefficients. */
int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs = vabsq_s16(coefs);
coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
/* Compute diff values. */
int16x8_t diff = veorq_s16(coefs, sign_coefs);
/* Store transformed coefficients and diff values. */
vst1q_s16(values_ptr, coefs);
vst1q_s16(diff_values_ptr, diff);
values_ptr += 8;
diff_values_ptr += 8;
rows_to_zero--;
}
/* Zero remaining memory in the values and diff_values blocks. */
for (i = 0; i < rows_to_zero; i++) {
vst1q_s16(values_ptr, vdupq_n_s16(0));
vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
values_ptr += 8;
diff_values_ptr += 8;
}
/* Construct zerobits bitmap. A set bit means that the corresponding
* coefficient != 0.
*/
int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store zerobits bitmap. */
*zerobits = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store zerobits bitmap. */
zerobits[0] = ~bitmap0;
zerobits[1] = ~bitmap1;
#endif
}
/* Data preparation for encode_mcu_AC_refine().
*
* The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
* found in jcphuff.c.
*/
int jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits)
{
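/* On return, bits[] holds the zerobits bitmap followed by the signbits bitmap
 * (one 64-bit word each on AArch64, two 32-bit words each otherwise), and the
 * return value is the EOB position.
 */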
/* Temporary storage buffers for data used to compute the signbits bitmap and
* the end-of-block (EOB) position
*/
uint8_t coef_sign_bits[64];
uint8_t coef_eq1_bits[64];
JCOEF *absvalues_ptr = absvalues;
uint8_t *coef_sign_bits_ptr = coef_sign_bits;
uint8_t *eq1_bits_ptr = coef_eq1_bits;
/* Rows of coefficients to zero (since they haven't been processed) */
int i, rows_to_zero = 8;
for (i = 0; i < Sl / 16; i++) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs1 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
uint8x8_t sign_coefs2 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs1 = vabsq_s16(coefs1);
int16x8_t abs_coefs2 = vabsq_s16(coefs2);
coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
vst1q_s16(absvalues_ptr, coefs1);
vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
absvalues_ptr += 16;
coef_sign_bits_ptr += 16;
eq1_bits_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
}
/* Same operation but for remaining partial vector */
int remaining_coefs = Sl % 16;
if (remaining_coefs > 8) {
int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
int16x8_t coefs2 = vdupq_n_s16(0);
switch (remaining_coefs) {
case 15:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs1 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
uint8x8_t sign_coefs2 =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs1);
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs1 = vabsq_s16(coefs1);
int16x8_t abs_coefs2 = vabsq_s16(coefs2);
coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
vst1q_s16(absvalues_ptr, coefs1);
vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
absvalues_ptr += 16;
coef_sign_bits_ptr += 16;
eq1_bits_ptr += 16;
jpeg_natural_order_start += 16;
rows_to_zero -= 2;
} else if (remaining_coefs > 0) {
int16x8_t coefs = vdupq_n_s16(0);
switch (remaining_coefs) {
case 8:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
FALLTHROUGH /*FALLTHROUGH*/
case 7:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
/* Compute and store data for signbits bitmap. */
uint8x8_t sign_coefs =
vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
vst1_u8(coef_sign_bits_ptr, sign_coefs);
/* Compute absolute value of coefficients and apply point transform Al. */
int16x8_t abs_coefs = vabsq_s16(coefs);
coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
vst1q_s16(absvalues_ptr, coefs);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq1);
absvalues_ptr += 8;
coef_sign_bits_ptr += 8;
eq1_bits_ptr += 8;
rows_to_zero--;
}
/* Zero remaining memory in blocks. */
for (i = 0; i < rows_to_zero; i++) {
vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
absvalues_ptr += 8;
coef_sign_bits_ptr += 8;
eq1_bits_ptr += 8;
}
/* Construct zerobits bitmap. */
int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store zerobits bitmap. */
bits[0] = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store zerobits bitmap. */
bits[0] = ~bitmap0;
bits[1] = ~bitmap1;
#endif
/* Construct signbits bitmap. */
uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Store signbits bitmap. */
bits[1] = ~bitmap;
#else
/* Move bitmap to two 32-bit scalar registers. */
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Store signbits bitmap. */
bits[2] = ~bitmap0;
bits[3] = ~bitmap1;
#endif
/* Construct bitmap to find EOB position (the index of the last coefficient
* equal to 1.)
*/
uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Move bitmap to a 64-bit scalar register. */
bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Return EOB position. */
if (bitmap == 0) {
/* EOB position is defined to be 0 if all coefficients != 1. */
return 0;
} else {
return 63 - BUILTIN_CLZLL(bitmap);
}
#else
/* Move bitmap to two 32-bit scalar registers. */
bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
/* Return EOB position. */
if (bitmap0 == 0 && bitmap1 == 0) {
return 0;
} else if (bitmap1 != 0) {
return 63 - BUILTIN_CLZ(bitmap1);
} else {
return 31 - BUILTIN_CLZ(bitmap0);
}
#endif
}
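/* Illustrative note (not part of the library source): the return value
 * 63 - BUILTIN_CLZLL(bitmap) is simply the index of the highest set bit of
 * the 64-bit bitmap, assuming BUILTIN_CLZLL maps to a count-leading-zeros
 * builtin such as __builtin_clzll on GCC/Clang.  For example, if only bit 5
 * is set (bitmap = 0x20), CLZ returns 58 and 63 - 58 = 5.  The aarch32 path
 * applies the same identity to the upper and lower 32-bit halves of the
 * bitmap separately.
 */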

@@ -0,0 +1,192 @@
/*
* jcsample-neon.c - downsampling (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
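/* Illustrative note (not part of the library source): each 16-byte row of the
 * table above is a byte-permutation index vector for the "Pad N" case, where
 * N = (width_in_blocks * 2 * DCTSIZE) - image_width is the number of padding
 * bytes needed in the last DCT block.  Each row repeats the index of the last
 * valid input byte, so the vtbl/vqtbl lookup replicates that byte into the
 * padding positions.  For example, with image_width = 30 and
 * width_in_blocks = 2 (padded width 32), mask_offset selects the "Pad 2" row,
 * whose final two indices are both 0x0D, so bytes 14 and 15 of the last block
 * become copies of input byte 13 (the last valid column).
 */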
/* Downsample pixel values of a single component.
* This version handles the common case of 2:1 horizontal and 1:1 vertical,
* without smoothing.
*/
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
const uint8x16_t expand_mask =
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
/* Load bias pattern (alternating every pixel.) */
/* { 0, 1, 0, 1, 0, 1, 0, 1 } */
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
unsigned i, outrow;
for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
/* Downsample all but the last DCT block of pixels. */
for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
/* Divide total by 2 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
/* Store samples to memory. */
vst1_u8(outptr + i * DCTSIZE, samples_u8);
}
/* Load pixels in last DCT block into a table. */
uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels = vqtbl1q_u8(pixels, expand_mask);
#else
uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif
/* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
/* Divide total by 2, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}
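/* Illustrative sketch (not part of the library source): a scalar equivalent
 * of the vector loop above, assuming the input row has already been padded to
 * a whole number of DCT blocks.  Each output sample is the average of two
 * adjacent input samples, and the alternating 0/1 bias provides the same
 * ordered dithering as the {0, 1, 0, 1, ...} vector used above.
 */
static void h2v1_downsample_row_scalar(const unsigned char *in,
                                       unsigned char *out,
                                       unsigned out_width)
{
  unsigned j;
  for (j = 0; j < out_width; j++) {
    unsigned bias = j & 1;                       /* 0, 1, 0, 1, ... */
    out[j] = (unsigned char)((in[2 * j] + in[2 * j + 1] + bias) >> 1);
  }
}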
/* Downsample pixel values of a single component.
* This version handles the standard case of 2:1 horizontal and 2:1 vertical,
* without smoothing.
*/
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr0, inptr1, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
const uint8x16_t expand_mask =
vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
/* Load bias pattern (alternating every pixel.) */
/* { 1, 2, 1, 2, 1, 2, 1, 2 } */
const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
unsigned i, outrow;
for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[outrow];
inptr1 = input_data[outrow + 1];
/* Downsample all but the last DCT block of pixels. */
for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
*/
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
/* Divide total by 4 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
/* Store samples to memory and increment pointers. */
vst1_u8(outptr + i * DCTSIZE, samples_u8);
}
/* Load pixels in last DCT block into a table. */
uint8x16_t pixels_r0 =
vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
uint8x16_t pixels_r1 =
vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
uint8x8x2_t table_r0 =
{ { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
uint8x8x2_t table_r1 =
{ { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
/* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
/* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
/* Divide total by 4, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}
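/* Illustrative note (not part of the library source): in scalar terms, each
 * output sample above is
 *   out[j] = (in0[2j] + in0[2j+1] + in1[2j] + in1[2j+1] + bias) >> 2
 * with bias alternating 1, 2, 1, 2, ... across the row (the vector loaded
 * from 0x00020001).  For example, averaging the 2x2 block {100, 101} over
 * {102, 103} in the first output column (bias 1) gives (406 + 1) >> 2 = 101.
 */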

@@ -0,0 +1,374 @@
/*
* jdcolext-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdcolor-neon.c. */
/* YCbCr -> RGB conversion is defined by the following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.3441467 = 11277 * 2^-15
* 0.7141418 = 23401 * 2^-15
* 1.4020386 = 22971 * 2^-14
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdcolor-neon.c.
*
* To ensure correct results, rounding is used when descaling.
*/
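/* Illustrative note (not part of the library source): the 2^-14-scaled
 * constants are applied below with vqrdmulhq_lane_s16 on inputs that are
 * first doubled with vshlq_n_s16(x, 1).  Ignoring saturation,
 * vqrdmulhq_s16(a, b) computes (2 * a * b + 2^15) >> 16, so with a = 2 * x
 * this equals (x * b + 2^13) >> 14, i.e. a rounded multiplication by
 * b * 2^-14.  For example, with Cr - 128 = 100 and b = F_1_402 = 22971:
 *   (100 * 22971 + 8192) >> 14 = 140, which approximates 1.402 * 100.
 * The 2^-15-scaled G constants are instead applied with widening multiplies
 * and a rounding narrowing right shift by 15.  The -128 chroma bias itself is
 * applied by widening-adding the unsigned Cb/Cr bytes to 16-bit lanes holding
 * -128 (0xFF80) and reinterpreting the sum as signed, which yields Cb - 128
 * and Cr - 128 without a separate subtraction.
 */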
/* Notes on safe memory access for YCbCr -> RGB conversion routines:
*
* Input memory buffers can be safely overread up to the next multiple of
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
* The output buffer cannot safely be written beyond output_width, since
* output_buf points to a possibly unpadded row in the decompressed image
* buffer allocated by the calling program.
*/
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
JSAMPROW outptr;
/* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
inptr1 = input_buf[1][input_row];
inptr2 = input_buf[2][input_row];
input_row++;
outptr = *output_buf++;
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
uint8x16_t y = vld1q_u8(inptr0);
uint8x16_t cb = vld1q_u8(inptr1);
uint8x16_t cr = vld1q_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cr)));
int16x8_t cr_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cr)));
int16x8_t cb_128_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_low_u8(cb)));
int16x8_t cb_128_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
vget_high_u8(cb)));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
consts, 0);
int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
consts, 0);
g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
consts, 1);
g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
consts, 1);
g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
consts, 1);
g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
vrshrn_n_s32(g_sub_y_lh, 15));
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
vrshrn_n_s32(g_sub_y_hh, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
consts, 2);
int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
consts, 3);
int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
consts, 3);
/* Add Y. */
int16x8_t r_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
vget_low_u8(y)));
int16x8_t r_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
vget_high_u8(y)));
int16x8_t b_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
vget_low_u8(y)));
int16x8_t b_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
vget_high_u8(y)));
int16x8_t g_l =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
vget_low_u8(y)));
int16x8_t g_h =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
vget_high_u8(y)));
#if RGB_PIXELSIZE == 4
uint8x16x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
uint8x16x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565_l);
vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif
/* Increment pointers. */
inptr0 += 16;
inptr1 += 16;
inptr2 += 16;
outptr += (RGB_PIXELSIZE * 16);
}
if (cols_remaining >= 8) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
vst3_u8(outptr, rgb);
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565);
#endif
/* Increment pointers. */
inptr0 += 8;
inptr1 += 8;
inptr2 += 8;
outptr += (RGB_PIXELSIZE * 8);
cols_remaining -= 8;
}
/* Handle the tail elements. */
if (cols_remaining > 0) {
uint8x8_t y = vld1_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
consts, 3);
/* Add Y. */
int16x8_t r =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
int16x8_t b =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
int16x8_t g =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgba.val[RGB_RED] = vqmovun_s16(r);
rgba.val[RGB_GREEN] = vqmovun_s16(g);
rgba.val[RGB_BLUE] = vqmovun_s16(b);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr, rgba, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#elif RGB_PIXELSIZE == 3
uint8x8x3_t rgb;
/* Convert each component to unsigned and narrow, clamping to [0-255]. */
rgb.val[RGB_RED] = vqmovun_s16(r);
rgb.val[RGB_GREEN] = vqmovun_s16(g);
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr, rgb, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
/* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB565 pixel data to memory. */
switch (cols_remaining) {
case 7:
vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}
}
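/* Illustrative note (not part of the library source): the RGB565 path above
 * packs each pixel with saturating left shifts and shift-right-and-insert
 * operations.  vqshluq_n_s16(r, 8) places the clamped red value in the upper
 * byte of a 16-bit lane; vsriq_n_u16(..., g << 8, 5) shifts the green lane
 * right by 5 and inserts it below the top 5 red bits, and
 * vsriq_n_u16(..., b << 8, 11) inserts the top 5 blue bits into the low bits.
 * The resulting lane is RRRRRGGG GGGBBBBB (red in bits 15-11, green in bits
 * 10-5, blue in bits 4-0), stored in native byte order.
 */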

@@ -0,0 +1,142 @@
/*
* jdcolor-neon.c - colorspace conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
-F_0_344, F_0_714, F_1_402, F_1_772
};
/* Include inline routines for colorspace extensions. */
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
#include "jdcolext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
/* YCbCr -> RGB565 Conversion */
#define RGB_PIXELSIZE 2
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
#include "jdcolext-neon.c"
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
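/* Illustrative note (not part of the library source): jdcolext-neon.c serves
 * as a template above.  Each inclusion is compiled with a different set of
 * RGB_RED/RGB_GREEN/RGB_BLUE/RGB_ALPHA/RGB_PIXELSIZE macros and with
 * jsimd_ycc_rgb_convert_neon renamed, so one generic routine body yields a
 * separate specialized function per output pixel format, e.g.
 * jsimd_ycc_extrgbx_convert_neon for 4-byte RGBX output and
 * jsimd_ycc_rgb565_convert_neon for the 2-byte RGB565 layout.
 */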

@@ -0,0 +1,145 @@
/*
* jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
-F_0_344, F_0_714, F_1_402, F_1_772
};
/* Include inline routines for colorspace extensions. */
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#define RGB_RED EXT_RGB_RED
#define RGB_GREEN EXT_RGB_GREEN
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_RGBX_RED
#define RGB_GREEN EXT_RGBX_GREEN
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_BGR_RED
#define RGB_GREEN EXT_BGR_GREEN
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_BGRX_RED
#define RGB_GREEN EXT_BGRX_GREEN
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_ALPHA 3
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_XBGR_RED
#define RGB_GREEN EXT_XBGR_GREEN
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon
#define RGB_RED EXT_XRGB_RED
#define RGB_GREEN EXT_XRGB_GREEN
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_ALPHA 0
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
#include "jdmrgext-neon.c"
#undef RGB_RED
#undef RGB_GREEN
#undef RGB_BLUE
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
#undef jsimd_h2v2_merged_upsample_neon

@@ -0,0 +1,723 @@
/*
* jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jdmerge-neon.c. */
/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
* chroma upsampling and YCbCr -> RGB color conversion into a single function.
*
* As with the standalone functions, YCbCr -> RGB conversion is defined by the
* following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.3441467 = 11277 * 2^-15
* 0.7141418 = 23401 * 2^-15
* 1.4020386 = 22971 * 2^-14
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdmerge-neon.c.
*
* To ensure correct results, rounding is used when descaling.
*/
/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
* routines:
*
* Input memory buffers can be safely overread up to the next multiple of
* ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
* The output buffer cannot safely be written beyond output_width, since
* output_buf points to a possibly unpadded row in the decompressed image
* buffer allocated by the calling program.
*/
/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
*/
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
JSAMPROW outptr;
/* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0 = input_buf[0][in_row_group_ctr];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr = output_buf[0];
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
/* De-interleave Y component values into two separate vectors, one
* containing the component values with even-numbered indices and one
* containing the component values with odd-numbered indices.
*/
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
* "odd" Y component values. This effectively upsamples the chroma
* components horizontally.
*/
int16x8_t g_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[0]));
int16x8_t r_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[0]));
int16x8_t b_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[0]));
int16x8_t g_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[1]));
int16x8_t r_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[1]));
int16x8_t b_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
#ifdef RGB_ALPHA
uint8x16x4_t rgba;
rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
/* Set alpha channel to opaque (0xFF). */
rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr, rgba);
#else
uint8x16x3_t rgb;
rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
#endif
/* Increment pointers. */
inptr0 += 16;
inptr1 += 8;
inptr2 += 8;
outptr += (RGB_PIXELSIZE * 16);
}
if (cols_remaining > 0) {
/* De-interleave Y component values into two separate vectors, one
* containing the component values with even-numbered indices and one
* containing the component values with odd-numbered indices.
*/
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
* "odd" Y component values. This effectively upsamples the chroma
* components horizontally.
*/
int16x8_t g_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[0]));
int16x8_t r_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[0]));
int16x8_t b_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[0]));
int16x8_t g_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y.val[1]));
int16x8_t r_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y.val[1]));
int16x8_t b_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
#ifdef RGB_ALPHA
uint8x8x4_t rgba_h;
rgba_h.val[RGB_RED] = r.val[1];
rgba_h.val[RGB_GREEN] = g.val[1];
rgba_h.val[RGB_BLUE] = b.val[1];
/* Set alpha channel to opaque (0xFF). */
rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
uint8x8x4_t rgba_l;
rgba_l.val[RGB_RED] = r.val[0];
rgba_l.val[RGB_GREEN] = g.val[0];
rgba_l.val[RGB_BLUE] = b.val[0];
/* Set alpha channel to opaque (0xFF). */
rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 15:
vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst4_u8(outptr, rgba_l);
break;
case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr, rgba_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
uint8x8x3_t rgb_h;
rgb_h.val[RGB_RED] = r.val[1];
rgb_h.val[RGB_GREEN] = g.val[1];
rgb_h.val[RGB_BLUE] = b.val[1];
uint8x8x3_t rgb_l;
rgb_l.val[RGB_RED] = r.val[0];
rgb_l.val[RGB_GREEN] = g.val[0];
rgb_l.val[RGB_BLUE] = b.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 15:
vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst3_u8(outptr, rgb_l);
break;
case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr, rgb_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}
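/* Illustrative sketch (not part of the library source): in scalar terms, the
 * merged h2v1 routine above computes, for each chroma sample index j,
 *   cb = Cb[j] - 128,  cr = Cr[j] - 128
 *   r_sub_y =  1.40200 * cr
 *   g_sub_y = -0.34414 * cb - 0.71414 * cr
 *   b_sub_y =  1.77200 * cb
 * and then adds the same three chroma terms to both Y[2*j] and Y[2*j + 1]
 * before clamping to [0, 255].  Sharing one chroma sample between two
 * adjacent output pixels is exactly the simple (non-fancy) h2v1 upsample,
 * which is why no separate upsampling pass is needed.
 */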
/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
*
* See comments above for details regarding color conversion and safe memory
* access.
*/
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf)
{
JSAMPROW outptr0, outptr1;
/* Pointers to Y (both rows), Cb, and Cr data */
JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0_0 = input_buf[0][in_row_group_ctr * 2];
inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
outptr1 = output_buf[1];
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
/* For each row, de-interleave Y component values into two separate
* vectors, one containing the component values with even-numbered indices
* and one containing the component values with odd-numbered indices.
*/
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
* the "even" and "odd" Y component values. This effectively upsamples the
* chroma components both horizontally and vertically.
*/
int16x8_t g0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[0]));
int16x8_t r0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[0]));
int16x8_t b0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[0]));
int16x8_t g0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[1]));
int16x8_t r0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[1]));
int16x8_t b0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[1]));
int16x8_t g1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[0]));
int16x8_t r1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[0]));
int16x8_t b1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[0]));
int16x8_t g1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[1]));
int16x8_t r1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[1]));
int16x8_t b1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
#ifdef RGB_ALPHA
uint8x16x4_t rgba0, rgba1;
rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
/* Set alpha channel to opaque (0xFF). */
rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
vst4q_u8(outptr0, rgba0);
vst4q_u8(outptr1, rgba1);
#else
uint8x16x3_t rgb0, rgb1;
rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
/* Store RGB pixel data to memory. */
vst3q_u8(outptr0, rgb0);
vst3q_u8(outptr1, rgb1);
#endif
/* Increment pointers. */
inptr0_0 += 16;
inptr0_1 += 16;
inptr1 += 8;
inptr2 += 8;
outptr0 += (RGB_PIXELSIZE * 16);
outptr1 += (RGB_PIXELSIZE * 16);
}
if (cols_remaining > 0) {
/* For each row, de-interleave Y component values into two separate
* vectors, one containing the component values with even-numbered indices
* and one containing the component values with odd-numbered indices.
*/
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
int16x8_t cr_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
int16x8_t cb_128 =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
/* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
/* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
* the "even" and "odd" Y component values. This effectively upsamples the
* chroma components both horizontally and vertically.
*/
int16x8_t g0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[0]));
int16x8_t r0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[0]));
int16x8_t b0_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[0]));
int16x8_t g0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y0.val[1]));
int16x8_t r0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y0.val[1]));
int16x8_t b0_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y0.val[1]));
int16x8_t g1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[0]));
int16x8_t r1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[0]));
int16x8_t b1_even =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[0]));
int16x8_t g1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
y1.val[1]));
int16x8_t r1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
y1.val[1]));
int16x8_t b1_odd =
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
y1.val[1]));
/* Convert each component to unsigned and narrow, clamping to [0-255].
* Re-interleave the "even" and "odd" component values.
*/
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
#ifdef RGB_ALPHA
uint8x8x4_t rgba0_h, rgba1_h;
rgba0_h.val[RGB_RED] = r0.val[1];
rgba1_h.val[RGB_RED] = r1.val[1];
rgba0_h.val[RGB_GREEN] = g0.val[1];
rgba1_h.val[RGB_GREEN] = g1.val[1];
rgba0_h.val[RGB_BLUE] = b0.val[1];
rgba1_h.val[RGB_BLUE] = b1.val[1];
/* Set alpha channel to opaque (0xFF). */
rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
uint8x8x4_t rgba0_l, rgba1_l;
rgba0_l.val[RGB_RED] = r0.val[0];
rgba1_l.val[RGB_RED] = r1.val[0];
rgba0_l.val[RGB_GREEN] = g0.val[0];
rgba1_l.val[RGB_GREEN] = g1.val[0];
rgba0_l.val[RGB_BLUE] = b0.val[0];
rgba1_l.val[RGB_BLUE] = b1.val[0];
/* Set alpha channel to opaque (0xFF). */
rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
case 15:
vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst4_u8(outptr0, rgba0_l);
vst4_u8(outptr1, rgba1_l);
break;
case 7:
vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst4_lane_u8(outptr0, rgba0_l, 0);
vst4_lane_u8(outptr1, rgba1_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#else
uint8x8x3_t rgb0_h, rgb1_h;
rgb0_h.val[RGB_RED] = r0.val[1];
rgb1_h.val[RGB_RED] = r1.val[1];
rgb0_h.val[RGB_GREEN] = g0.val[1];
rgb1_h.val[RGB_GREEN] = g1.val[1];
rgb0_h.val[RGB_BLUE] = b0.val[1];
rgb1_h.val[RGB_BLUE] = b1.val[1];
uint8x8x3_t rgb0_l, rgb1_l;
rgb0_l.val[RGB_RED] = r0.val[0];
rgb1_l.val[RGB_RED] = r1.val[0];
rgb0_l.val[RGB_GREEN] = g0.val[0];
rgb1_l.val[RGB_GREEN] = g1.val[0];
rgb0_l.val[RGB_BLUE] = b0.val[0];
rgb1_l.val[RGB_BLUE] = b1.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
case 15:
vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 14:
vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 13:
vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 12:
vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 11:
vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 10:
vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 9:
vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
FALLTHROUGH /*FALLTHROUGH*/
case 8:
vst3_u8(outptr0, rgb0_l);
vst3_u8(outptr1, rgb1_l);
break;
case 7:
vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
FALLTHROUGH /*FALLTHROUGH*/
case 6:
vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
FALLTHROUGH /*FALLTHROUGH*/
case 5:
vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
FALLTHROUGH /*FALLTHROUGH*/
case 4:
vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
FALLTHROUGH /*FALLTHROUGH*/
case 3:
vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
FALLTHROUGH /*FALLTHROUGH*/
case 2:
vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
FALLTHROUGH /*FALLTHROUGH*/
case 1:
vst3_lane_u8(outptr0, rgb0_l, 0);
vst3_lane_u8(outptr1, rgb1_l, 0);
FALLTHROUGH /*FALLTHROUGH*/
default:
break;
}
#endif
}
}

@@ -0,0 +1,569 @@
/*
* jdsample-neon.c - upsampling (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <arm_neon.h>
/* The diagram below shows a row of samples produced by h2v1 downsampling.
*
* s0 s1 s2
* +---------+---------+---------+
* | | | |
* | p0 p1 | p2 p3 | p4 p5 |
* | | | |
* +---------+---------+---------+
*
* Samples s0-s2 were created by averaging the original pixel component values
* centered at positions p0-p5 above. To approximate those original pixel
* component values, we proportionally blend the adjacent samples in each row.
*
* An upsampled pixel component value is computed by blending the sample
* containing the pixel center with the nearest neighboring sample, in the
* ratio 3:1. For example:
* p1(upsampled) = 3/4 * s0 + 1/4 * s1
* p2(upsampled) = 3/4 * s1 + 1/4 * s0
* When computing the first and last pixel component values in the row, there
* is no adjacent sample to blend, so:
* p0(upsampled) = s0
* p5(upsampled) = s2
*/
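/* Illustrative arithmetic (the sample values here are hypothetical, chosen
* only to make the 3:1 blend above concrete): with s0 = 100 and s1 = 120,
*   p1 = (3 * 100 + 120) / 4 = 105
*   p2 = (3 * 120 + 100) / 4 = 115
* so each upsampled value remains closest to the sample that contains it.
* (Dither/rounding bias is omitted here for clarity.)
*/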
void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t one_u16 = vdupq_n_u16(1);
const uint8x8_t three_u8 = vdup_n_u8(3);
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr = output_data[inrow];
/* First pixel component value in this row of the original image */
*outptr = (JSAMPLE)GETJSAMPLE(*inptr);
/* 3/4 * containing sample + 1/4 * nearest neighboring sample
* For p1: containing sample = s0, nearest neighboring sample = s1
* For p2: containing sample = s1, nearest neighboring sample = s0
*/
uint8x16_t s0 = vld1q_u8(inptr);
uint8x16_t s1 = vld1q_u8(inptr + 1);
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
uint16x8_t s1_add_3s0_l =
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
uint16x8_t s1_add_3s0_h =
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
uint16x8_t s0_add_3s1_l =
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
uint16x8_t s0_add_3s1_h =
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
/* Add ordered dithering bias to odd pixel values. */
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
/* The offset is initially 1, because the first pixel component has already
* been stored. However, in subsequent iterations of the SIMD loop, this
* offset is (2 * colctr - 1) to stay within the bounds of the sample
* buffers without having to resort to a slow scalar tail case for the last
* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
* in jmemmgr.c for more details.
*/
unsigned outptr_offset = 1;
uint8x16x2_t output_pixels;
/* We use software pipelining to maximise performance. The code indented
* an extra two spaces begins the next iteration of the loop.
*/
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
s0 = vld1q_u8(inptr + colctr - 1);
s1 = vld1q_u8(inptr + colctr);
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
vrshrn_n_u16(s1_add_3s0_h, 2));
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
vshrn_n_u16(s0_add_3s1_h, 2));
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
s1_add_3s0_l =
vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
s1_add_3s0_h =
vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
s0_add_3s1_l =
vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
s0_add_3s1_h =
vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
/* Add ordered dithering bias to odd pixel values. */
s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
/* Store pixel component values to memory. */
vst2q_u8(outptr + outptr_offset, output_pixels);
outptr_offset = 2 * colctr - 1;
}
/* Complete the last iteration of the loop. */
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
vrshrn_n_u16(s1_add_3s0_h, 2));
output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
vshrn_n_u16(s0_add_3s1_h, 2));
/* Store pixel component values to memory. */
vst2q_u8(outptr + outptr_offset, output_pixels);
/* Last pixel component value in this row of the original image */
outptr[2 * downsampled_width - 1] =
GETJSAMPLE(inptr[downsampled_width - 1]);
}
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
*
* s0 s1 s2
* +---------+---------+---------+
* | p0 p1 | p2 p3 | p4 p5 |
* sA | | | |
* | p6 p7 | p8 p9 | p10 p11|
* +---------+---------+---------+
* | p12 p13| p14 p15| p16 p17|
* sB | | | |
* | p18 p19| p20 p21| p22 p23|
* +---------+---------+---------+
* | p24 p25| p26 p27| p28 p29|
* sC | | | |
* | p30 p31| p32 p33| p34 p35|
* +---------+---------+---------+
*
* Samples s0A-s2C were created by averaging the original pixel component
* values centered at positions p0-p35 above. To approximate one of those
* original pixel component values, we proportionally blend the sample
* containing the pixel center with the nearest neighboring samples in each
* row, column, and diagonal.
*
* An upsampled pixel component value is computed by first blending the sample
* containing the pixel center with the nearest neighboring samples in the
* same column, in the ratio 3:1, and then blending each column sum with the
* nearest neighboring column sum, in the ratio 3:1. For example:
* p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
* 1/4 * (3/4 * s0B + 1/4 * s0A)
* = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
* When computing the first and last pixel component values in the row, there
* is no horizontally adjacent sample to blend, so:
* p12(upsampled) = 3/4 * s0B + 1/4 * s0A
* p23(upsampled) = 3/4 * s2B + 1/4 * s2C
* When computing the first and last pixel component values in the column,
* there is no vertically adjacent sample to blend, so:
* p2(upsampled) = 3/4 * s1A + 1/4 * s0A
* p33(upsampled) = 3/4 * s1C + 1/4 * s2C
* When computing the corner pixel component values, there is no adjacent
* sample to blend, so:
* p0(upsampled) = s0A
* p35(upsampled) = s2C
*/
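/* Illustrative arithmetic (hypothetical sample values, chosen only to make
* the weights above concrete): with s0A = 16, s0B = 32, s1A = 48, s1B = 64,
*   s0 column sum = 3 * 32 + 16 = 112
*   s1 column sum = 3 * 64 + 48 = 240
*   p14 = (3 * 240 + 112) / 16 = 52
* which equals 9/16 * 64 + 3/16 * 48 + 3/16 * 32 + 1/16 * 16, matching the
* weights quoted above.  (Dither/rounding bias is omitted for clarity.)
*/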
void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t seven_u16 = vdupq_n_u16(7);
const uint8x8_t three_u8 = vdup_n_u8(3);
const uint16x8_t three_u16 = vdupq_n_u16(3);
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
inptr0 = input_data[inrow - 1];
inptr1 = input_data[inrow];
inptr2 = input_data[inrow + 1];
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
* respectively.
*/
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
/* First pixel component value in this row of the original image */
int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
*outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
*outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
/* Step 1: Blend samples vertically in columns s0 and s1.
* Leave the divide by 4 until the end, when it can be done for both
* dimensions at once, right-shifting by 4.
*/
/* Load and compute s0colsum0 and s0colsum1. */
uint8x16_t s0A = vld1q_u8(inptr0);
uint8x16_t s0B = vld1q_u8(inptr1);
uint8x16_t s0C = vld1q_u8(inptr2);
/* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
* denote low half and high half respectively.
*/
uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
vget_low_u8(s0B), three_u8);
uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
vget_high_u8(s0B), three_u8);
uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
vget_low_u8(s0B), three_u8);
uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
vget_high_u8(s0B), three_u8);
/* Load and compute s1colsum0 and s1colsum1. */
uint8x16_t s1A = vld1q_u8(inptr0 + 1);
uint8x16_t s1B = vld1q_u8(inptr1 + 1);
uint8x16_t s1C = vld1q_u8(inptr2 + 1);
uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
vget_low_u8(s1B), three_u8);
uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
vget_high_u8(s1B), three_u8);
uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
vget_low_u8(s1B), three_u8);
uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
vget_high_u8(s1B), three_u8);
/* Step 2: Blend the already-blended columns. */
uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
/* Add ordered dithering bias to odd pixel values. */
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
uint8x16x2_t output_pixels0 = { {
vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
} };
uint8x16x2_t output_pixels1 = { {
vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
} };
/* Store pixel component values to memory.
* The minimum size of the output buffer for each row is 64 bytes => no
* need to worry about buffer overflow here. See "Creation of 2-D sample
* arrays" in jmemmgr.c for more details.
*/
vst2q_u8(outptr0 + 1, output_pixels0);
vst2q_u8(outptr1 + 1, output_pixels1);
/* The first pixel of the image shifted our loads and stores by one byte.
* We have to re-align on a 32-byte boundary at some point before the end
* of the row (we do it now on the 32/33 pixel boundary) to stay within the
* bounds of the sample buffers without having to resort to a slow scalar
* tail case for the last (downsampled_width % 16) samples. See "Creation
* of 2-D sample arrays" in jmemmgr.c for more details.
*/
for (colctr = 16; colctr < downsampled_width; colctr += 16) {
/* Step 1: Blend samples vertically in columns s0 and s1. */
/* Load and compute s0colsum0 and s0colsum1. */
s0A = vld1q_u8(inptr0 + colctr - 1);
s0B = vld1q_u8(inptr1 + colctr - 1);
s0C = vld1q_u8(inptr2 + colctr - 1);
s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
three_u8);
s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
three_u8);
s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
three_u8);
s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
three_u8);
/* Load and compute s1colsum0 and s1colsum1. */
s1A = vld1q_u8(inptr0 + colctr);
s1B = vld1q_u8(inptr1 + colctr);
s1C = vld1q_u8(inptr2 + colctr);
s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
three_u8);
s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
three_u8);
s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
three_u8);
s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
three_u8);
/* Step 2: Blend the already-blended columns. */
output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
/* Add ordered dithering bias to odd pixel values. */
output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
/* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
vshrn_n_u16(output0_p1_h, 4));
output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
vrshrn_n_u16(output0_p2_h, 4));
output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
vshrn_n_u16(output1_p1_h, 4));
output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
vrshrn_n_u16(output1_p2_h, 4));
/* Store pixel component values to memory. */
vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
}
/* Last pixel component value in this row of the original image */
int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
GETJSAMPLE(inptr0[downsampled_width - 1]);
outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
GETJSAMPLE(inptr2[downsampled_width - 1]);
outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
inrow++;
}
}
/* The diagram below shows a column of samples produced by h1v2 downsampling
* (or by losslessly rotating or transposing an h2v1-downsampled image.)
*
* +---------+
* | p0 |
* sA | |
* | p1 |
* +---------+
* | p2 |
* sB | |
* | p3 |
* +---------+
* | p4 |
* sC | |
* | p5 |
* +---------+
*
* Samples sA-sC were created by averaging the original pixel component values
* centered at positions p0-p5 above. To approximate those original pixel
* component values, we proportionally blend the adjacent samples in each
* column.
*
* An upsampled pixel component value is computed by blending the sample
* containing the pixel center with the nearest neighboring sample, in the
* ratio 3:1. For example:
* p1(upsampled) = 3/4 * sA + 1/4 * sB
* p2(upsampled) = 3/4 * sB + 1/4 * sA
* When computing the first and last pixel component values in the column,
* there is no adjacent sample to blend, so:
* p0(upsampled) = sA
* p5(upsampled) = sC
*/
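/* Illustrative arithmetic (hypothetical sample values): with sA = 200 and
* sB = 100, the 3:1 vertical blend above gives
*   p1 = (3 * 200 + 100) / 4 = 175
*   p2 = (3 * 100 + 200) / 4 = 125
* (dither/rounding bias omitted for clarity).
*/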
void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
JDIMENSION downsampled_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
/* Set up constants. */
const uint16x8_t one_u16 = vdupq_n_u16(1);
const uint8x8_t three_u8 = vdup_n_u8(3);
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
inptr0 = input_data[inrow - 1];
inptr1 = input_data[inrow];
inptr2 = input_data[inrow + 1];
/* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
* respectively.
*/
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
inrow++;
/* The size of the input and output buffers is always a multiple of 32
* bytes => no need to worry about buffer overflow when reading/writing
* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
* details.
*/
for (colctr = 0; colctr < downsampled_width; colctr += 16) {
/* Load samples. */
uint8x16_t sA = vld1q_u8(inptr0 + colctr);
uint8x16_t sB = vld1q_u8(inptr1 + colctr);
uint8x16_t sC = vld1q_u8(inptr2 + colctr);
/* Blend samples vertically. */
uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
vget_high_u8(sB), three_u8);
uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
vget_low_u8(sB), three_u8);
uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
vget_high_u8(sB), three_u8);
/* Add ordered dithering bias to pixel values in even output rows. */
colsum0_l = vaddq_u16(colsum0_l, one_u16);
colsum0_h = vaddq_u16(colsum0_h, one_u16);
/* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
vshrn_n_u16(colsum0_h, 2));
uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
vrshrn_n_u16(colsum1_h, 2));
/* Store pixel component values to memory. */
vst1q_u8(outptr0 + colctr, output_pixels0);
vst1q_u8(outptr1 + colctr, output_pixels1);
}
}
}
/* The diagram below shows a row of samples produced by h2v1 downsampling.
*
* s0 s1
* +---------+---------+
* | | |
* | p0 p1 | p2 p3 |
* | | |
* +---------+---------+
*
* Samples s0 and s1 were created by averaging the original pixel component
* values centered at positions p0-p3 above. To approximate those original
* pixel component values, we duplicate the samples horizontally:
* p0(upsampled) = p1(upsampled) = s0
* p2(upsampled) = p3(upsampled) = s1
*/
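/* A scalar sketch of the duplication performed below (for illustration only;
* the loop variable i is hypothetical):
*
*   for (i = 0; 2 * i < output_width; i++)
*     outptr[2 * i] = outptr[2 * i + 1] = inptr[i];
*
* The Neon version performs the same copy with one vld1q_u8 load and one
* interleaving vst2q_u8 store per 16 input samples.
*/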
void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow;
unsigned colctr;
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr = output_data[inrow];
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
uint8x16_t samples = vld1q_u8(inptr + colctr);
/* Duplicate the samples. The store operation below interleaves them so
* that adjacent pixel component values take on the same sample value,
* per above.
*/
uint8x16x2_t output_pixels = { { samples, samples } };
/* Store pixel component values to memory.
* Due to the way sample buffers are allocated, we don't need to worry
* about tail cases when output_width is not a multiple of 32. See
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
*/
vst2q_u8(outptr + 2 * colctr, output_pixels);
}
}
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
*
* s0 s1
* +---------+---------+
* | p0 p1 | p2 p3 |
* sA | | |
* | p4 p5 | p6 p7 |
* +---------+---------+
* | p8 p9 | p10 p11|
* sB | | |
* | p12 p13| p14 p15|
* +---------+---------+
*
* Samples s0A-s1B were created by averaging the original pixel component
* values centered at positions p0-p15 above. To approximate those original
* pixel component values, we duplicate the samples both horizontally and
* vertically:
* p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
* p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
* p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
* p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
*/
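/* A scalar sketch of the duplication performed below (illustration only; the
* loop variable i is hypothetical):
*
*   for (i = 0; 2 * i < output_width; i++)
*     outptr0[2 * i] = outptr0[2 * i + 1] =
*       outptr1[2 * i] = outptr1[2 * i + 1] = inptr[i];
*
* i.e. the same interleaved store as the h2v1 case, written to two output
* rows so that the samples are also duplicated vertically.
*/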
void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr0, outptr1;
int inrow, outrow;
unsigned colctr;
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
inptr = input_data[inrow];
outptr0 = output_data[outrow++];
outptr1 = output_data[outrow++];
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
uint8x16_t samples = vld1q_u8(inptr + colctr);
/* Duplicate the samples. The store operation below interleaves them so
* that adjacent pixel component values take on the same sample value,
* per above.
*/
uint8x16x2_t output_pixels = { { samples, samples } };
/* Store pixel component values for both output rows to memory.
* Due to the way sample buffers are allocated, we don't need to worry
* about tail cases when output_width is not a multiple of 32. See
* "Creation of 2-D sample arrays" in jmemmgr.c for details.
*/
vst2q_u8(outptr0 + 2 * colctr, output_pixels);
vst2q_u8(outptr1 + 2 * colctr, output_pixels);
}
}
}

@@ -0,0 +1,214 @@
/*
* jfdctfst-neon.c - fast integer FDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
* (Discrete Cosine Transform) on one block of samples. It uses the same
* calculations and produces exactly the same output as IJG's original
* jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.382683433 = 12544 * 2^-15
* 0.541196100 = 17792 * 2^-15
* 0.707106781 = 23168 * 2^-15
* 0.306562965 = 9984 * 2^-15
*
* See jfdctfst.c for further details of the DCT algorithm. Where possible,
* the variable names and comments here in jsimd_fdct_ifast_neon() match up
* with those in jpeg_fdct_ifast().
*/
#define F_0_382 12544
#define F_0_541 17792
#define F_0_707 23168
#define F_0_306 9984
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
F_0_382, F_0_541, F_0_707, F_0_306
};
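/* Note on how the constants are applied: vqdmulhq_lane_s16(x, consts, lane)
* computes approximately (2 * x * c) >> 16, i.e. x * c / 32768.  With
* F_0_707 = 23168, for example, that is x * 23168 / 32768 = x * 0.70703125,
* a close fixed-point approximation of x * 0.707106781 = x / sqrt(2).
*/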
void jsimd_fdct_ifast_neon(DCTELEM *data)
{
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
* are used, followed by vuzp to transpose the block such that we have a
* column of samples per vector - allowing all rows to be processed at once.
*/
int16x8x4_t data1 = vld4q_s16(data);
int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
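/* To make the transpose concrete: vld4q_s16() de-interleaves by four, so
* data1.val[0] holds { r0c0, r0c4, r1c0, r1c4, r2c0, r2c4, r3c0, r3c4 } and
* data2.val[0] holds the same pattern for rows 4-7.  vuzpq_s16() then
* separates even and odd positions, leaving cols_04.val[0] = column 0 and
* cols_04.val[1] = column 4 across all eight rows, and likewise for the
* other column pairs.
*/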
int16x8_t col0 = cols_04.val[0];
int16x8_t col1 = cols_15.val[0];
int16x8_t col2 = cols_26.val[0];
int16x8_t col3 = cols_37.val[0];
int16x8_t col4 = cols_04.val[1];
int16x8_t col5 = cols_15.val[1];
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
/* Load DCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
int16x8_t tmp6 = vsubq_s16(col1, col6);
int16x8_t tmp2 = vaddq_s16(col2, col5);
int16x8_t tmp5 = vsubq_s16(col2, col5);
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
/* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
col4 = vsubq_s16(tmp10, tmp11);
int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
col2 = vaddq_s16(tmp13, z1); /* phase 5 */
col6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
int16x8_t z13 = vsubq_s16(tmp7, z3);
col5 = vaddq_s16(z13, z2); /* phase 6 */
col3 = vsubq_s16(z13, z2);
col1 = vaddq_s16(z11, z4);
col7 = vsubq_s16(z11, z4);
/* Transpose to work on columns in pass 2. */
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
vreinterpretq_s32_s16(cols_45.val[0]));
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
vreinterpretq_s32_s16(cols_45.val[1]));
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
vreinterpretq_s32_s16(cols_67.val[0]));
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
vreinterpretq_s32_s16(cols_67.val[1]));
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
tmp6 = vsubq_s16(row1, row6);
tmp2 = vaddq_s16(row2, row5);
tmp5 = vsubq_s16(row2, row5);
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
/* Even part */
tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
tmp12 = vsubq_s16(tmp1, tmp2);
row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
row4 = vsubq_s16(tmp10, tmp11);
z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
row2 = vaddq_s16(tmp13, z1); /* phase 5 */
row6 = vsubq_s16(tmp13, z1);
/* Odd part */
tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
tmp11 = vaddq_s16(tmp5, tmp6);
tmp12 = vaddq_s16(tmp6, tmp7);
z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
z2 = vaddq_s16(z2, z5);
z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
z5 = vaddq_s16(tmp12, z5);
z4 = vaddq_s16(z4, z5);
z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
z11 = vaddq_s16(tmp7, z3); /* phase 5 */
z13 = vsubq_s16(tmp7, z3);
row5 = vaddq_s16(z13, z2); /* phase 6 */
row3 = vsubq_s16(z13, z2);
row1 = vaddq_s16(z11, z4);
row7 = vsubq_s16(z11, z4);
vst1q_s16(data + 0 * DCTSIZE, row0);
vst1q_s16(data + 1 * DCTSIZE, row1);
vst1q_s16(data + 2 * DCTSIZE, row2);
vst1q_s16(data + 3 * DCTSIZE, row3);
vst1q_s16(data + 4 * DCTSIZE, row4);
vst1q_s16(data + 5 * DCTSIZE, row5);
vst1q_s16(data + 6 * DCTSIZE, row6);
vst1q_s16(data + 7 * DCTSIZE, row7);
}

@@ -0,0 +1,376 @@
/*
* jfdctint-neon.c - accurate integer FDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
* (Discrete Cosine Transform) on one block of samples. It uses the same
* calculations and produces exactly the same output as IJG's original
* jpeg_fdct_islow() function, which can be found in jfdctint.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.298631336 = 2446 * 2^-13
* 0.390180644 = 3196 * 2^-13
* 0.541196100 = 4433 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.175875602 = 9633 * 2^-13
* 1.501321110 = 12299 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 1.961570560 = 16069 * 2^-13
* 2.053119869 = 16819 * 2^-13
* 2.562915447 = 20995 * 2^-13
* 3.072711026 = 25172 * 2^-13
*
* See jfdctint.c for further details of the DCT algorithm. Where possible,
* the variable names and comments here in jsimd_fdct_islow_neon() match up
* with those in jpeg_fdct_islow().
*/
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
#define F_0_298 2446
#define F_0_390 3196
#define F_0_541 4433
#define F_0_765 6270
#define F_0_899 7373
#define F_1_175 9633
#define F_1_501 12299
#define F_1_847 15137
#define F_1_961 16069
#define F_2_053 16819
#define F_2_562 20995
#define F_3_072 25172
ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
F_0_298, -F_0_390, F_0_541, F_0_765,
-F_0_899, F_1_175, F_1_501, -F_1_847,
-F_1_961, F_2_053, -F_2_562, F_3_072
};
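/* Note on how the constants are applied: pass 1 widens each product with
* vmull_lane_s16()/vmlal_lane_s16() and narrows it with
* vrshrn_n_s32(..., DESCALE_P1), where DESCALE_P1 = CONST_BITS - PASS1_BITS.
* A multiply by F_0_541 = 4433 followed by that shift therefore yields
* round(x * 4433 / 2^11) = (x * 0.541196...) * 2^PASS1_BITS, i.e. the
* 2^-13-scaled constant listed above, kept at two extra bits of pass-1
* precision.
*/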
void jsimd_fdct_islow_neon(DCTELEM *data)
{
/* Load DCT constants. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Load an 8x8 block of samples into Neon registers. De-interleaving loads
* are used, followed by vuzp to transpose the block such that we have a
* column of samples per vector - allowing all rows to be processed at once.
*/
int16x8x4_t s_rows_0123 = vld4q_s16(data);
int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
int16x8_t col0 = cols_04.val[0];
int16x8_t col1 = cols_15.val[0];
int16x8_t col2 = cols_26.val[0];
int16x8_t col3 = cols_37.val[0];
int16x8_t col4 = cols_04.val[1];
int16x8_t col5 = cols_15.val[1];
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
int16x8_t tmp6 = vsubq_s16(col1, col6);
int16x8_t tmp2 = vaddq_s16(col2, col5);
int16x8_t tmp5 = vsubq_s16(col2, col5);
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
/* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
int32x4_t z1_l =
vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t z1_h =
vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t col2_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
int32x4_t col2_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
int32x4_t col6_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
int32x4_t col6_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
/* Odd part */
int16x8_t z1 = vaddq_s16(tmp4, tmp7);
int16x8_t z2 = vaddq_s16(tmp5, tmp6);
int16x8_t z3 = vaddq_s16(tmp4, tmp6);
int16x8_t z4 = vaddq_s16(tmp5, tmp7);
/* sqrt(2) * c3 */
int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
/* sqrt(2) * (-c1+c3+c5-c7) */
int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
/* sqrt(2) * ( c1+c3-c5+c7) */
int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
/* sqrt(2) * ( c1+c3+c5-c7) */
int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
/* sqrt(2) * ( c1+c3-c5-c7) */
int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
/* sqrt(2) * (c7-c3) */
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
/* sqrt(2) * (-c1-c3) */
int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
/* sqrt(2) * (-c3-c5) */
int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
/* sqrt(2) * (c5-c3) */
int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
z3_l = vaddq_s32(z3_l, z5_l);
z3_h = vaddq_s32(z3_h, z5_h);
z4_l = vaddq_s32(z4_l, z5_l);
z4_h = vaddq_s32(z4_h, z5_h);
tmp4_l = vaddq_s32(tmp4_l, z1_l);
tmp4_h = vaddq_s32(tmp4_h, z1_h);
tmp4_l = vaddq_s32(tmp4_l, z3_l);
tmp4_h = vaddq_s32(tmp4_h, z3_h);
col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
vrshrn_n_s32(tmp4_h, DESCALE_P1));
tmp5_l = vaddq_s32(tmp5_l, z2_l);
tmp5_h = vaddq_s32(tmp5_h, z2_h);
tmp5_l = vaddq_s32(tmp5_l, z4_l);
tmp5_h = vaddq_s32(tmp5_h, z4_h);
col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
vrshrn_n_s32(tmp5_h, DESCALE_P1));
tmp6_l = vaddq_s32(tmp6_l, z2_l);
tmp6_h = vaddq_s32(tmp6_h, z2_h);
tmp6_l = vaddq_s32(tmp6_l, z3_l);
tmp6_h = vaddq_s32(tmp6_h, z3_h);
col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
vrshrn_n_s32(tmp6_h, DESCALE_P1));
tmp7_l = vaddq_s32(tmp7_l, z1_l);
tmp7_h = vaddq_s32(tmp7_h, z1_h);
tmp7_l = vaddq_s32(tmp7_l, z4_l);
tmp7_h = vaddq_s32(tmp7_h, z4_h);
col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
vrshrn_n_s32(tmp7_h, DESCALE_P1));
/* Transpose to work on columns in pass 2. */
int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
vreinterpretq_s32_s16(cols_45.val[0]));
int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
vreinterpretq_s32_s16(cols_45.val[1]));
int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
vreinterpretq_s32_s16(cols_67.val[0]));
int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
vreinterpretq_s32_s16(cols_67.val[1]));
int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
tmp6 = vsubq_s16(row1, row6);
tmp2 = vaddq_s16(row2, row5);
tmp5 = vsubq_s16(row2, row5);
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
/* Even part */
tmp10 = vaddq_s16(tmp0, tmp3);
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
tmp12 = vsubq_s16(tmp1, tmp2);
row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
int32x4_t row2_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
int32x4_t row2_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
int32x4_t row6_scaled_l =
vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
int32x4_t row6_scaled_h =
vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
/* Odd part */
z1 = vaddq_s16(tmp4, tmp7);
z2 = vaddq_s16(tmp5, tmp6);
z3 = vaddq_s16(tmp4, tmp6);
z4 = vaddq_s16(tmp5, tmp7);
/* sqrt(2) * c3 */
z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
/* sqrt(2) * (-c1+c3+c5-c7) */
tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
/* sqrt(2) * ( c1+c3-c5+c7) */
tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
/* sqrt(2) * ( c1+c3+c5-c7) */
tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
/* sqrt(2) * ( c1+c3-c5-c7) */
tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
/* sqrt(2) * (c7-c3) */
z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
/* sqrt(2) * (-c1-c3) */
z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
/* sqrt(2) * (-c3-c5) */
z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
/* sqrt(2) * (c5-c3) */
z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
z3_l = vaddq_s32(z3_l, z5_l);
z3_h = vaddq_s32(z3_h, z5_h);
z4_l = vaddq_s32(z4_l, z5_l);
z4_h = vaddq_s32(z4_h, z5_h);
tmp4_l = vaddq_s32(tmp4_l, z1_l);
tmp4_h = vaddq_s32(tmp4_h, z1_h);
tmp4_l = vaddq_s32(tmp4_l, z3_l);
tmp4_h = vaddq_s32(tmp4_h, z3_h);
row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
vrshrn_n_s32(tmp4_h, DESCALE_P2));
tmp5_l = vaddq_s32(tmp5_l, z2_l);
tmp5_h = vaddq_s32(tmp5_h, z2_h);
tmp5_l = vaddq_s32(tmp5_l, z4_l);
tmp5_h = vaddq_s32(tmp5_h, z4_h);
row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
vrshrn_n_s32(tmp5_h, DESCALE_P2));
tmp6_l = vaddq_s32(tmp6_l, z2_l);
tmp6_h = vaddq_s32(tmp6_h, z2_h);
tmp6_l = vaddq_s32(tmp6_l, z3_l);
tmp6_h = vaddq_s32(tmp6_h, z3_h);
row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
vrshrn_n_s32(tmp6_h, DESCALE_P2));
tmp7_l = vaddq_s32(tmp7_l, z1_l);
tmp7_h = vaddq_s32(tmp7_h, z1_h);
tmp7_l = vaddq_s32(tmp7_l, z4_l);
tmp7_h = vaddq_s32(tmp7_h, z4_h);
row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
vrshrn_n_s32(tmp7_h, DESCALE_P2));
vst1q_s16(data + 0 * DCTSIZE, row0);
vst1q_s16(data + 1 * DCTSIZE, row1);
vst1q_s16(data + 2 * DCTSIZE, row2);
vst1q_s16(data + 3 * DCTSIZE, row3);
vst1q_s16(data + 4 * DCTSIZE, row4);
vst1q_s16(data + 5 * DCTSIZE, row5);
vst1q_s16(data + 6 * DCTSIZE, row6);
vst1q_s16(data + 7 * DCTSIZE, row7);
}

@@ -0,0 +1,472 @@
/*
* jidctfst-neon.c - fast integer IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include <arm_neon.h>
/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
* inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
* uses the same calculations and produces exactly the same output as IJG's
* original jpeg_idct_ifast() function, which can be found in jidctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.082392200 = 2688 * 2^-15
* 0.414213562 = 13568 * 2^-15
* 0.847759065 = 27776 * 2^-15
* 0.613125930 = 20096 * 2^-15
*
* See jidctfst.c for further details of the IDCT algorithm. Where possible,
* the variable names and comments here in jsimd_idct_ifast_neon() match up
* with those in jpeg_idct_ifast().
*/
#define PASS1_BITS 2
#define F_0_082 2688
#define F_0_414 13568
#define F_0_847 27776
#define F_0_613 20096
ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
F_0_082, F_0_414, F_0_847, F_0_613
};
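/* Note on how the constants are applied: the vqdmulh*_lane_s16() calls below
* compute approximately x * c / 32768, and the code uses identities such as
* x * (1 + c') with c' = c - 1 so that c' fits in a signed Q15 value.  For
* example, multiplying by consts lane 1 gives x * 13568 / 32768 =
* x * 0.4140625, and adding x back in yields x * 1.4140625, a fixed-point
* approximation of x * sqrt(2) = x * 1.414213562.
*/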
void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
IFAST_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row4);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
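/* If either 64-bit half of the OR-ed AC rows is zero, the corresponding four
* columns carry no AC energy, so the pass-1 computation below is skipped for
* those columns and the (dequantized) DC value is used for them instead.
*/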
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into vectors.
*/
int16x8_t dcval = row0;
row1 = dcval;
row2 = dcval;
row3 = dcval;
row4 = dcval;
row5 = dcval;
row6 = dcval;
row7 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_low_s16(row0);
/* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_high_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Use DC values for these columns.
*/
int16x4_t dcval = vget_high_s16(row0);
/* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x4_t tmp0 = vget_low_s16(row0);
int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsub_s16(tmp10, tmp13);
tmp1 = vadd_s16(tmp11, tmp12);
tmp2 = vsub_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
int16x4_t z11 = vadd_s16(tmp4, tmp7);
int16x4_t z12 = vsub_s16(tmp4, tmp7);
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsub_s16(tmp11, tmp6);
tmp4 = vadd_s16(tmp10, tmp5);
row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
} else {
/* Some AC coefficients are non-zero; full IDCT calculation required. */
/* Load quantization table. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part: dequantize DCT coefficients. */
int16x8_t tmp0 = row0;
int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsubq_s16(tmp12, tmp13);
tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
tmp3 = vsubq_s16(tmp10, tmp13);
tmp1 = vaddq_s16(tmp11, tmp12);
tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part: dequantize DCT coefficients. */
int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
int16x8_t z11 = vaddq_s16(tmp4, tmp7);
int16x8_t z12 = vsubq_s16(tmp4, tmp7);
tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
tmp5 = vsubq_s16(tmp11, tmp6);
tmp4 = vaddq_s16(tmp10, tmp5);
row0 = vaddq_s16(tmp0, tmp7);
row7 = vsubq_s16(tmp0, tmp7);
row1 = vaddq_s16(tmp1, tmp6);
row6 = vsubq_s16(tmp1, tmp6);
row2 = vaddq_s16(tmp2, tmp5);
row5 = vsubq_s16(tmp2, tmp5);
row4 = vaddq_s16(tmp3, tmp4);
row3 = vsubq_s16(tmp3, tmp4);
}
/* Transpose rows to work on columns in pass 2. */
int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
vreinterpretq_s32_s16(rows_45.val[0]));
int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
vreinterpretq_s32_s16(rows_45.val[1]));
int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
vreinterpretq_s32_s16(rows_67.val[0]));
int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
vreinterpretq_s32_s16(rows_67.val[1]));
int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
/* 1-D IDCT, pass 2 */
/* Even part */
int16x8_t tmp10 = vaddq_s16(col0, col4);
int16x8_t tmp11 = vsubq_s16(col0, col4);
int16x8_t tmp13 = vaddq_s16(col2, col6);
int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
tmp12 = vaddq_s16(tmp12, col2_sub_col6);
tmp12 = vsubq_s16(tmp12, tmp13);
int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
/* Odd part */
int16x8_t z13 = vaddq_s16(col5, col3);
int16x8_t neg_z10 = vsubq_s16(col3, col5);
int16x8_t z11 = vaddq_s16(col1, col7);
int16x8_t z12 = vsubq_s16(col1, col7);
int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
col0 = vaddq_s16(tmp0, tmp7);
col7 = vsubq_s16(tmp0, tmp7);
col1 = vaddq_s16(tmp1, tmp6);
col6 = vsubq_s16(tmp1, tmp6);
col2 = vaddq_s16(tmp2, tmp5);
col5 = vsubq_s16(tmp2, tmp5);
col4 = vaddq_s16(tmp3, tmp4);
col3 = vsubq_s16(tmp3, tmp4);
/* Scale down by a factor of 8, narrowing to 8-bit. */
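/* The shift of PASS1_BITS + 3 removes both the 2^PASS1_BITS scaling applied
 * in pass 1 and the factor of 8 inherent in the 8-point IDCT; vqshrn_n_s16
 * also saturates the result to the signed 8-bit range.
 */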
int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
vqshrn_n_s16(col1, PASS1_BITS + 3));
int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
vqshrn_n_s16(col5, PASS1_BITS + 3));
int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
vqshrn_n_s16(col3, PASS1_BITS + 3));
int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
vqshrn_n_s16(col7, PASS1_BITS + 3));
/* Clamp to range [0-255]. */
uint8x16_t cols_01 =
vreinterpretq_u8_s8
(vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_45 =
vreinterpretq_u8_s8
(vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_23 =
vreinterpretq_u8_s8
(vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
uint8x16_t cols_67 =
vreinterpretq_u8_s8
(vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
/* Transpose block to prepare for store. */
uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
vreinterpretq_u32_u8(cols_45));
uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
vreinterpretq_u32_u8(cols_67));
uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
vreinterpretq_u8_u32(cols_0415.val[1]));
uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
vreinterpretq_u8_u32(cols_2637.val[1]));
uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
vreinterpretq_u16_u8(cols_2367.val[0]));
uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
vreinterpretq_u16_u8(cols_2367.val[1]));
uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
JSAMPROW outptr0 = output_buf[0] + output_col;
JSAMPROW outptr1 = output_buf[1] + output_col;
JSAMPROW outptr2 = output_buf[2] + output_col;
JSAMPROW outptr3 = output_buf[3] + output_col;
JSAMPROW outptr4 = output_buf[4] + output_col;
JSAMPROW outptr5 = output_buf[5] + output_col;
JSAMPROW outptr6 = output_buf[6] + output_col;
JSAMPROW outptr7 = output_buf[7] + output_col;
/* Store DCT block to memory. */
vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
}

@@ -0,0 +1,802 @@
/*
* jidctint-neon.c - accurate integer IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
/* The computation of the inverse DCT requires the use of constants known at
* compile time. Scaled integer constants are used to avoid floating-point
* arithmetic:
* 0.298631336 = 2446 * 2^-13
* 0.390180644 = 3196 * 2^-13
* 0.541196100 = 4433 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.175875602 = 9633 * 2^-13
* 1.501321110 = 12299 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 1.961570560 = 16069 * 2^-13
* 2.053119869 = 16819 * 2^-13
* 2.562915447 = 20995 * 2^-13
* 3.072711026 = 25172 * 2^-13
*/
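/* Each constant above is the real value rounded to 13 fractional bits; for
 * example, F_0_298 = round(0.298631336 * 2^13) = 2446, so multiplying by
 * F_0_298 and descaling by CONST_BITS approximates multiplication by
 * 0.298631336.
 */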
#define F_0_298 2446
#define F_0_390 3196
#define F_0_541 4433
#define F_0_765 6270
#define F_0_899 7373
#define F_1_175 9633
#define F_1_501 12299
#define F_1_847 15137
#define F_1_961 16069
#define F_2_053 16819
#define F_2_562 20995
#define F_3_072 25172
#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
F_0_899, F_0_541,
F_2_562, F_0_298_MINUS_0_899,
F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
F_0_541_PLUS_0_765, F_1_175,
F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
0, 0, 0, 0
};
/* Forward declaration of regular and sparse IDCT helper functions */
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t row4,
int16x4_t row5,
int16x4_t row6,
int16x4_t row7,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16x4_t quant_row4,
int16x4_t quant_row5,
int16x4_t quant_row6,
int16x4_t quant_row7,
int16_t *workspace_1,
int16_t *workspace_2);
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16_t *workspace_1,
int16_t *workspace_2);
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
/* Perform dequantization and inverse DCT on one block of coefficients. For
* reference, the C implementation (jpeg_idct_islow()) can be found in
* jidctint.c.
*
* Optimization techniques used for fast data access:
*
* In each pass, the inverse DCT is computed for the left and right 4x8 halves
* of the DCT block. This avoids spilling due to register pressure, and the
* increased granularity allows for an optimized calculation depending on the
* values of the DCT coefficients. Between passes, intermediate data is stored
* in 4x8 workspace buffers.
*
* Transposing the 8x8 DCT block after each pass can be achieved by transposing
* each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
* diagram below.) Swapping quadrants is cheap, since the second pass can just
* swap the workspace buffer pointers.
*
* +-------+-------+ +-------+-------+
* | | | | | |
* | 0 | 1 | | 0 | 2 |
* | | | transpose | | |
* +-------+-------+ ------> +-------+-------+
* | | | | | |
* | 2 | 3 | | 1 | 3 |
* | | | | | |
* +-------+-------+ +-------+-------+
*
* Optimization techniques used to accelerate the inverse DCT calculation:
*
* In a DCT coefficient block, the coefficients are increasingly likely to be 0
* as you move diagonally from top left to bottom right. If whole rows of
* coefficients are 0, then the inverse DCT calculation can be simplified. On
* the first pass of the inverse DCT, we test for three special cases before
* defaulting to a full "regular" inverse DCT:
*
* 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
* "sparse" simplified inverse DCT on rows 0-3.
* 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
* result is equal to the dequantized DC coefficients.
* 3) AC and DC coefficients are all zero. In this case, the inverse DCT
* result is all zero. For the left 4x8 half, this is handled identically
* to Case 2 above. For the right 4x8 half, we do no work and signal that
* the "sparse" algorithm is required for the second pass.
*
* In the second pass, only a single special case is tested: whether the AC and
* DC coefficients were all zero in the right 4x8 block during the first pass
* (refer to Case 3 above.) If this is the case, then a "sparse" variant of
* the second pass is performed for both the left and right halves of the DCT
* block. (The transposition after the first pass means that the right 4x8
* block during the first pass becomes rows 4-7 during the second pass.)
*/
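/* Informal sketch of the pass 1 dispatch described above (the authoritative
 * logic is in jsimd_idct_islow_neon() below):
 *
 *   if (coefficients in rows 4-7 of the 4x8 half are all zero)
 *     if (AC coefficients in rows 1-3 are also all zero)
 *       -> DC-only (or all-zero) shortcut
 *     else
 *       -> jsimd_idct_islow_pass1_sparse()
 *   else
 *     -> jsimd_idct_islow_pass1_regular()
 */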
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
int16_t workspace_l[8 * DCTSIZE / 2];
int16_t workspace_r[8 * DCTSIZE / 2];
/* Compute IDCT first pass on left 4x8 coefficient block. */
/* Load DCT coefficients in left 4x8 block. */
int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table for left 4x8 block. */
int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
int16x4_t bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (bitmap_rows_4567 == 0) {
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (left_ac_bitmap == 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l, quadrant);
vst4_s16(workspace_r, quadrant);
} else {
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
quant_row1, quant_row2, quant_row3,
workspace_l, workspace_r);
}
} else {
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
row6, row7, quant_row0, quant_row1,
quant_row2, quant_row3, quant_row4,
quant_row5, quant_row6, quant_row7,
workspace_l, workspace_r);
}
/* Compute IDCT first pass on right 4x8 coefficient block. */
/* Load DCT coefficients in right 4x8 block. */
row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
/* Load quantization table for right 4x8 block. */
quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
bitmap = vorr_s16(row7, row6);
bitmap = vorr_s16(bitmap, row5);
bitmap = vorr_s16(bitmap, row4);
bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
bitmap = vorr_s16(bitmap, row3);
bitmap = vorr_s16(bitmap, row2);
bitmap = vorr_s16(bitmap, row1);
int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
/* If this remains non-zero, a "regular" second pass will be performed. */
int64_t right_ac_dc_bitmap = 1;
if (right_ac_bitmap == 0) {
bitmap = vorr_s16(bitmap, row0);
right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
if (right_ac_dc_bitmap != 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
}
} else {
if (bitmap_rows_4567 == 0) {
jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
quant_row1, quant_row2, quant_row3,
workspace_l + 4 * DCTSIZE / 2,
workspace_r + 4 * DCTSIZE / 2);
} else {
jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
row6, row7, quant_row0, quant_row1,
quant_row2, quant_row3, quant_row4,
quant_row5, quant_row6, quant_row7,
workspace_l + 4 * DCTSIZE / 2,
workspace_r + 4 * DCTSIZE / 2);
}
}
/* Second pass: compute IDCT on rows in workspace. */
/* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
if (right_ac_dc_bitmap == 0) {
jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
} else {
jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
}
}
/* Perform dequantization and the first pass of the accurate inverse DCT on a
* 4x8 block of coefficients. (To process the full 8x8 DCT block, this
* function-- or some other optimized variant-- needs to be called for both the
* left and right 4x8 blocks.)
*
* This "regular" version assumes that no optimization can be made to the IDCT
* calculation, since no useful set of AC coefficients is all 0.
*
* The original C implementation of the accurate IDCT (jpeg_idct_islow()) can be
* found in jidctint.c. Algorithmic changes made here are documented inline.
*/
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t row4,
int16x4_t row5,
int16x4_t row6,
int16x4_t row7,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16x4_t quant_row4,
int16x4_t quant_row5,
int16x4_t quant_row6,
int16x4_t quant_row7,
int16_t *workspace_1,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
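/* GCC does not currently support the intrinsic vld1_<type>_x3(). */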
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
z2_s16 = vmul_s16(row0, quant_row0);
z3_s16 = vmul_s16(row4, quant_row4);
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part */
int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
*
* This implementation:
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
*/
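/* Note that z3 and z4 on the right-hand sides above refer to the values
 * before the update; each output is computed with a single vmull_lane_s16
 * followed by one vmlal_lane_s16, avoiding a separate z5 vector.
 */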
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
* tmp0 += z1 + z3; tmp1 += z2 + z4;
* tmp2 += z2 + z3; tmp3 += z1 + z4;
*
* This implementation:
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
* tmp0 += z3; tmp1 += z4;
* tmp2 += z3; tmp3 += z4;
*/
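/* Likewise, tmp0-tmp3 on the right-hand sides above are the values before
 * the update. Folding z1 and z2 into the lane constants means each tmp is
 * produced with one vmull_lane_s16, one vmlsl_lane_s16, and one vaddq_s32,
 * without materializing z1 or z2 as separate vectors.
 */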
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
tmp0 = vaddq_s32(tmp0, z3);
tmp1 = vaddq_s32(tmp1, z4);
tmp2 = vaddq_s32(tmp2, z3);
tmp3 = vaddq_s32(tmp3, z4);
/* Final output stage: descale and narrow to 16-bit. */
int16x4x4_t rows_0123 = { {
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
} };
int16x4x4_t rows_4567 = { {
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
} };
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
* (VST4 transposes the blocks. We need to operate on rows in the next
* pass.)
*/
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
/* Perform dequantization and the first pass of the accurate inverse DCT on a
* 4x8 block of coefficients.
*
* This "sparse" version assumes that the AC coefficients in rows 4-7 are all
* 0. This simplifies the IDCT calculation, accelerating overall performance.
*/
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16_t *workspace_1,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part (z3 is all 0) */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
z2_s16 = vmul_s16(row0, quant_row0);
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
int16x4_t z3_s16 = tmp2_s16;
int16x4_t z4_s16 = tmp3_s16;
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
/* Final output stage: descale and narrow to 16-bit. */
int16x4x4_t rows_0123 = { {
vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
} };
int16x4x4_t rows_4567 = { {
vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
} };
/* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
* (VST4 transposes the blocks. We need to operate on rows in the next
* pass.)
*/
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
* coefficients. (To process the full 8x8 DCT block, this function-- or some
* other optimized variant-- needs to be called for both the right and left 4x8
* blocks.)
*
* This "regular" version assumes that no optimization can be made to the IDCT
* calculation, since no useful set of coefficient values is all 0 after the
* first pass.
*
* Again, the original C implementation of the accurate IDCT (jpeg_idct_islow())
* can be found in jidctint.c. Algorithmic changes made here are documented
* inline.
*/
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part */
int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
*
* This implementation:
* z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
* z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
*/
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
/* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
* z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
* tmp0 += z1 + z3; tmp1 += z2 + z4;
* tmp2 += z2 + z3; tmp3 += z1 + z4;
*
* This implementation:
* tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
* tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
* tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
* tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
* tmp0 += z3; tmp1 += z4;
* tmp2 += z3; tmp3 += z4;
*/
tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
tmp0 = vaddq_s32(tmp0, z3);
tmp1 = vaddq_s32(tmp1, z4);
tmp2 = vaddq_s32(tmp2, z3);
tmp3 = vaddq_s32(tmp3, z4);
/* Final output stage: descale and narrow to 16-bit. */
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
vaddhn_s32(tmp12, tmp1));
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
vaddhn_s32(tmp13, tmp0));
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
vsubhn_s32(tmp11, tmp2));
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
vsubhn_s32(tmp10, tmp3));
/* Descale and narrow to 8-bit. */
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
/* Clamp to range [0-255]. */
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
* together allows us to store 16-bit elements.)
*/
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
uint16x4x4_t cols_01_23_45_67 = { {
vreinterpret_u16_u8(cols_01_23.val[0]),
vreinterpret_u16_u8(cols_01_23.val[1]),
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
} };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}
/* Perform the second pass of the accurate inverse DCT on a 4x8 block
* of coefficients.
*
* This "sparse" version assumes that the coefficient values (after the first
* pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
* accelerating overall performance.
*/
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
#else
const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
/* Even part (z3 is all 0) */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
/* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
int16x4_t z3_s16 = tmp2_s16;
int16x4_t z4_s16 = tmp3_s16;
int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
/* Final output stage: descale and narrow to 16-bit. */
int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
vaddhn_s32(tmp12, tmp1));
int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
vaddhn_s32(tmp13, tmp0));
int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
vsubhn_s32(tmp11, tmp2));
int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
vsubhn_s32(tmp10, tmp3));
/* Descale and narrow to 8-bit. */
int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
/* Clamp to range [0-255]. */
uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
vdup_n_u8(CENTERJSAMPLE));
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
/* Transpose 4x8 block and store to memory. (Zipping adjacent columns
* together allows us to store 16-bit elements.)
*/
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
uint16x4x4_t cols_01_23_45_67 = { {
vreinterpret_u16_u8(cols_01_23.val[0]),
vreinterpret_u16_u8(cols_01_23.val[1]),
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
} };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}

@@ -0,0 +1,486 @@
/*
* jidctred-neon.c - reduced-size IDCT (Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"
#include <arm_neon.h>
#define CONST_BITS 13
#define PASS1_BITS 2
#define F_0_211 1730
#define F_0_509 4176
#define F_0_601 4926
#define F_0_720 5906
#define F_0_765 6270
#define F_0_850 6967
#define F_0_899 7373
#define F_1_061 8697
#define F_1_272 10426
#define F_1_451 11893
#define F_1_847 15137
#define F_2_172 17799
#define F_2_562 20995
#define F_3_624 29692
/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
* 2x2 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_2x2() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.720959822 = 5906 * 2^-13
* 0.850430095 = 6967 * 2^-13
* 1.272758580 = 10426 * 2^-13
* 3.624509785 = 29692 * 2^-13
*
* See jidctred.c for further details of the 2x2 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_2x2_neon()
* match up with those in jpeg_idct_2x2().
*/
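/* Note that only coefficient rows 0, 1, 3, 5, and 7 (and likewise only those
 * columns) contribute to the 2x2 output, which is why rows 2, 4, and 6 are
 * never loaded below.
 */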
ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
-F_0_720, F_0_850, -F_1_272, F_3_624
};
void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Dequantize DCT coefficients. */
row0 = vmulq_s16(row0, quant_row0);
row1 = vmulq_s16(row1, quant_row1);
row3 = vmulq_s16(row3, quant_row3);
row5 = vmulq_s16(row5, quant_row5);
row7 = vmulq_s16(row7, quant_row7);
/* Load IDCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
/* Pass 1: process columns from input, put results in vectors row0 and
* row1.
*/
/* Even part */
int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
/* Odd part */
int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
/* Transpose two rows, ready for second pass. */
int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
int16x8_t cols_0246 = cols_0246_1357.val[0];
int16x8_t cols_1357 = cols_0246_1357.val[1];
/* Duplicate columns such that each is accessible in its own vector. */
int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
vreinterpretq_s32_s16(cols_1357));
int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
/* Pass 2: process two rows, store to output array. */
/* Even part: we're only interested in col0; the top half of tmp10 is "don't
* care."
*/
int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
/* Odd part: we're only interested in the bottom half of tmp0. */
int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
vsubhn_s32(tmp10, tmp0));
output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
CONST_BITS + PASS1_BITS + 3 + 2 - 16);
/* Narrow to 8-bit and convert to unsigned. */
uint8x8_t output_u8 = vqmovun_s16(output_s16);
/* Store 2x2 block to memory. */
vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
}
/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
* 4x4 output from an 8x8 DCT block. It uses the same calculations and
* produces exactly the same output as IJG's original jpeg_idct_4x4() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.211164243 = 1730 * 2^-13
* 0.509795579 = 4176 * 2^-13
* 0.601344887 = 4926 * 2^-13
* 0.765366865 = 6270 * 2^-13
* 0.899976223 = 7373 * 2^-13
* 1.061594337 = 8697 * 2^-13
* 1.451774981 = 11893 * 2^-13
* 1.847759065 = 15137 * 2^-13
* 2.172734803 = 17799 * 2^-13
* 2.562915447 = 20995 * 2^-13
*
* See jidctred.c for further details of the 4x4 IDCT algorithm. Where
* possible, the variable names and comments here in jsimd_idct_4x4_neon()
* match up with those in jpeg_idct_4x4().
*/
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
F_1_847, -F_0_765, -F_0_211, F_1_451,
-F_2_172, F_1_061, -F_0_509, -F_0_601,
F_0_899, F_2_562, 0, 0
};
void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
/* Construct bitmap to test if all AC coefficients are 0. */
int16x8_t bitmap = vorrq_s16(row1, row2);
bitmap = vorrq_s16(bitmap, row3);
bitmap = vorrq_s16(bitmap, row5);
bitmap = vorrq_s16(bitmap, row6);
bitmap = vorrq_s16(bitmap, row7);
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
/* All AC coefficients are zero.
* Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
*/
int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
row0 = dcval;
row1 = dcval;
row2 = dcval;
row3 = dcval;
} else if (left_ac_bitmap == 0) {
/* AC coefficients are zero for columns 0, 1, 2, and 3.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
z2 = vmul_s16(vget_high_s16(row5), quant_row5);
z3 = vmul_s16(vget_high_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
} else if (right_ac_bitmap == 0) {
/* AC coefficients are zero for columns 4, 5, 6, and 7.
* Compute DC values for these columns.
*/
int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
/* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
z2 = vmul_s16(vget_low_s16(row5), quant_row5);
z3 = vmul_s16(vget_low_s16(row3), quant_row3);
int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
CONST_BITS - PASS1_BITS + 1), dcval);
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1), dcval);
} else {
/* Neither the left nor the right half of the block has all-zero AC
 * coefficients; a full IDCT calculation is required.
 */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
/* Even part */
int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x8_t z2 = vmulq_s16(row2, quant_row2);
int16x8_t z3 = vmulq_s16(row6, quant_row6);
int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
/* Odd part */
int16x8_t z1 = vmulq_s16(row7, quant_row7);
z2 = vmulq_s16(row5, quant_row5);
z3 = vmulq_s16(row3, quant_row3);
int16x8_t z4 = vmulq_s16(row1, quant_row1);
tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
CONST_BITS - PASS1_BITS + 1));
row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
CONST_BITS - PASS1_BITS + 1),
vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
CONST_BITS - PASS1_BITS + 1));
}
/* Transpose 8x4 block to perform IDCT on rows in second pass. */
int16x8x2_t row_01 = vtrnq_s16(row0, row1);
int16x8x2_t row_23 = vtrnq_s16(row2, row3);
int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
vreinterpretq_s32_s16(row_23.val[0]));
int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
vreinterpretq_s32_s16(row_23.val[1]));
int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
/* Commence second pass of IDCT. */
/* Even part */
int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
/* Odd part */
tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
vsubhn_s32(tmp12, tmp0));
int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
vsubhn_s32(tmp10, tmp2));
output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
/* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
* An interleaving store completes the transpose.
*/
uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
vqmovun_s16(output_cols_13));
uint16x4x2_t output_01_23 = { {
vreinterpret_u16_u8(output_0123.val[0]),
vreinterpret_u16_u8(output_0123.val[1])
} };
/* Store 4x4 block to memory. */
JSAMPROW outptr0 = output_buf[0] + output_col;
JSAMPROW outptr1 = output_buf[1] + output_col;
JSAMPROW outptr2 = output_buf[2] + output_col;
JSAMPROW outptr3 = output_buf[3] + output_col;
vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
}

@@ -0,0 +1,193 @@
/*
* jquanti-neon.c - sample data conversion and quantization (Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include <arm_neon.h>
/* After downsampling, the resulting sample values are in the range [0, 255],
* but the Discrete Cosine Transform (DCT) operates on values centered around
* 0.
*
* To prepare sample values for the DCT, load samples into a DCT workspace,
* subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
* are also widened from 8- to 16-bit.
*
* The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
*/
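/* Illustrative scalar equivalent of the operation performed below (a sketch
 * for reference, not part of the upstream file; convsamp() in jcdctmgr.c is
 * the actual scalar C version):
 *
 *   for (int r = 0; r < DCTSIZE; r++)
 *     for (int c = 0; c < DCTSIZE; c++)
 *       workspace[r * DCTSIZE + c] =
 *         (DCTELEM)sample_data[r][start_col + c] - CENTERJSAMPLE;
 */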
void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
int16x8_t row0 =
vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row1 =
vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row2 =
vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row3 =
vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row4 =
vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row5 =
vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row6 =
vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
int16x8_t row7 =
vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
vst1q_s16(workspace + 0 * DCTSIZE, row0);
vst1q_s16(workspace + 1 * DCTSIZE, row1);
vst1q_s16(workspace + 2 * DCTSIZE, row2);
vst1q_s16(workspace + 3 * DCTSIZE, row3);
vst1q_s16(workspace + 4 * DCTSIZE, row4);
vst1q_s16(workspace + 5 * DCTSIZE, row5);
vst1q_s16(workspace + 6 * DCTSIZE, row6);
vst1q_s16(workspace + 7 * DCTSIZE, row7);
}
/* After the DCT, the resulting array of coefficient values needs to be divided
* by an array of quantization values.
*
* To avoid a slow division operation, the DCT coefficients are multiplied by
* the (scaled) reciprocals of the quantization values and then right-shifted.
*
* The equivalent scalar C function quantize() can be found in jcdctmgr.c.
*/
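/* Illustrative scalar sketch of the per-coefficient computation below (not
 * part of the upstream file; recip, corr, and shift denote the three tables
 * packed into "divisors", and quantize() in jcdctmgr.c is the actual scalar C
 * version):
 *
 *   sign = (coef < 0) ? -1 : 1;
 *   mag = abs(coef) + corr;       // add pre-rounding correction
 *   mag = (mag * recip) >> 16;    // multiply by scaled reciprocal
 *   mag >>= shift;                // final per-element shift
 *   quantized = sign * mag;
 */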
void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
DCTELEM *workspace)
{
JCOEFPTR out_ptr = coef_block;
UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
int i;
#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
#pragma unroll
#endif
for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
/* Load reciprocals of quantization values. */
uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
/* Extract sign from coefficients. */
int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
/* Get absolute value of DCT coefficients. */
uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
/* Add correction. */
abs_row0 = vaddq_u16(abs_row0, corr0);
abs_row1 = vaddq_u16(abs_row1, corr1);
abs_row2 = vaddq_u16(abs_row2, corr2);
abs_row3 = vaddq_u16(abs_row3, corr3);
/* Multiply DCT coefficients by quantization reciprocals. */
int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
vget_low_u16(recip0)));
int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
vget_high_u16(recip0)));
int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
vget_low_u16(recip1)));
int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
vget_high_u16(recip1)));
int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
vget_low_u16(recip2)));
int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
vget_high_u16(recip2)));
int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
vget_low_u16(recip3)));
int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
vget_high_u16(recip3)));
/* Narrow back to 16-bit. */
row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
/* Since VSHR only supports an immediate as its second argument, negate the
* shift value and shift left.
*/
row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
vnegq_s16(shift0)));
row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
vnegq_s16(shift1)));
row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
vnegq_s16(shift2)));
row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
vnegq_s16(shift3)));
/* Restore sign to original product. */
row0 = veorq_s16(row0, sign_row0);
row0 = vsubq_s16(row0, sign_row0);
row1 = veorq_s16(row1, sign_row1);
row1 = vsubq_s16(row1, sign_row1);
row2 = veorq_s16(row2, sign_row2);
row2 = vsubq_s16(row2, sign_row2);
row3 = veorq_s16(row3, sign_row3);
row3 = vsubq_s16(row3, sign_row3);
/* Store quantized coefficients to memory. */
vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
}
}
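
/* A scalar sketch of the reciprocal-multiply idea for a single coefficient,
 * mirroring what the vector loop above does eight values at a time.  It uses
 * the same divisors[] layout that the code above indexes (reciprocal at
 * offset 0, correction at DCTSIZE2, shift at 3 * DCTSIZE2).  The helper name
 * quantize_one_sketch() is hypothetical; the reference implementation remains
 * quantize() in jcdctmgr.c.
 */
static JCOEF quantize_one_sketch(DCTELEM coef, UDCTELEM recip, UDCTELEM corr,
                                 DCTELEM shift)
{
  UDCTELEM abs_coef = (UDCTELEM)(coef < 0 ? -coef : coef);
  /* 16-bit correction add, as in the vector code. */
  UDCTELEM corrected = (UDCTELEM)(abs_coef + corr);
  /* Widening multiply by the scaled reciprocal; keep the high 16 bits of the
   * 32-bit product, then apply the per-coefficient right shift.  Together
   * these replace an integer division by the quantization value.
   */
  unsigned int product = (unsigned int)corrected * recip;
  UDCTELEM quantized = (UDCTELEM)((product >> 16) >> shift);
  /* Restore the original sign. */
  return (JCOEF)(coef < 0 ? -(int)quantized : (int)quantized);
}
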

@@ -0,0 +1,37 @@
/*
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#cmakedefine HAVE_VLD1_S16_X3
#cmakedefine HAVE_VLD1_U16_X2
#cmakedefine HAVE_VLD1Q_U8_X4
/* Define compiler-independent count-leading-zeros and byte-swap macros */
#if defined(_MSC_VER) && !defined(__clang__)
#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
#elif defined(__clang__) || defined(__GNUC__)
#define BUILTIN_CLZ(x) __builtin_clz(x)
#define BUILTIN_CLZLL(x) __builtin_clzll(x)
#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
#else
#error "Unknown compiler"
#endif
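
/* Usage sketch (illustrative only): with the macros above, callers such as
 * the NEON Huffman encoders can compute bit lengths and build big-endian bit
 * streams portably, e.g.
 *
 *   int nbits = 32 - BUILTIN_CLZ(value);     (assumes value != 0)
 *   uint64_t be = BUILTIN_BSWAP64(bitbuf);   (on little-endian hosts)
 *
 * without branching on the compiler at every call site.
 */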

@@ -0,0 +1,578 @@
;
; jccolext.asm - colorspace conversion (AVX2)
;
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
; ymmword wk[WK_NUM]
%define WK_NUM 8
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
EXTN(jsimd_rgb_ycc_convert_avx2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edx
push ebx
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
vmovd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
vmovd xmmF, XMM_DWORD [esi+ecx]
vpslldq xmmA, xmmA, SIZEOF_DWORD
vpor xmmA, xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
vmovq xmmB, XMM_MMWORD [esi+ecx]
vpslldq xmmA, xmmA, SIZEOF_MMWORD
vpor xmmA, xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
sub ecx, byte SIZEOF_XMMWORD
vmovdqu xmmB, XMM_MMWORD [esi+ecx]
vperm2i128 ymmA, ymmA, ymmA, 1
vpor ymmA, ymmB
.column_ld32:
test cl, SIZEOF_YMMWORD
jz short .column_ld64
sub ecx, byte SIZEOF_YMMWORD
vmovdqa ymmF, ymmA
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
.column_ld64:
test cl, 2*SIZEOF_YMMWORD
mov ecx, SIZEOF_YMMWORD
jz short .rgb_ycc_cnv
vmovdqa ymmB, ymmA
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
.rgb_ycc_cnv:
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vmovdqu ymmC, ymmA
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
vmovdqa ymmG, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
vmovdqa ymmD, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
vmovdqa ymmE, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
vpxor ymmH, ymmH, ymmH
vmovdqa ymmC, ymmA
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
vmovdqa ymmB, ymmE
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
vmovdqa ymmF, ymmD
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
vpslldq xmmA, xmmA, SIZEOF_MMWORD
vpor xmmA, xmmA, xmmF
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
vmovdqa xmmF, xmmA
vperm2i128 ymmF, ymmF, ymmF, 1
vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
vpor ymmA, ymmA, ymmF
.column_ld8:
test cl, SIZEOF_XMMWORD/2
jz short .column_ld16
sub ecx, byte SIZEOF_XMMWORD/2
vmovdqa ymmF, ymmA
vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld16:
test cl, SIZEOF_XMMWORD
mov ecx, SIZEOF_YMMWORD
jz short .rgb_ycc_cnv
vmovdqa ymmE, ymmA
vmovdqa ymmH, ymmF
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
.rgb_ycc_cnv:
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vmovdqa ymmB, ymmA
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vmovdqa ymmB, ymmF
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vmovdqa ymmD, ymmA
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
vmovdqa ymmC, ymmF
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
vmovdqa ymmB, ymmA
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
vmovdqa ymmG, ymmD
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
vmovdqa ymmE, ymmA
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
vmovdqa ymmH, ymmB
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
vpxor ymmF, ymmF, ymmF
vmovdqa ymmC, ymmA
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
vmovdqa ymmD, ymmB
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
vmovdqa ymmG, ymmE
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
vpunpcklbw ymmF, ymmF, ymmH
vpunpckhbw ymmH, ymmH, ymmH
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
%endif ; RGB_PIXELSIZE ; ---------------
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
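; (Note: FIX(0.58700) = 38470 does not fit in a signed 16-bit word, so it
; cannot be used directly as a vpmaddwd coefficient; that appears to be why
; 0.587*G is split into 0.337*G + 0.250*G and folded into the two word-pair
; products PW_F0299_F0337 and PW_F0114_F0250 below, while the 0.500 factors
; are applied with a shift rather than a multiply.)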
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
vmovdqa ymm6, ymm1
vpunpcklwd ymm1, ymm1, ymm3
vpunpckhwd ymm6, ymm6, ymm3
vmovdqa ymm7, ymm1
vmovdqa ymm4, ymm6
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
vpxor ymm1, ymm1, ymm1
vpxor ymm6, ymm6, ymm6
vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ]
vpaddd ymm7, ymm7, ymm1
vpaddd ymm4, ymm4, ymm6
vpaddd ymm7, ymm7, ymm5
vpaddd ymm4, ymm4, ymm5
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
vmovdqa ymm6, ymm0
vpunpcklwd ymm0, ymm0, ymm2
vpunpckhwd ymm6, ymm6, ymm2
vmovdqa ymm5, ymm0
vmovdqa ymm4, ymm6
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
vpxor ymm0, ymm0, ymm0
vpxor ymm6, ymm6, ymm6
vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
vpaddd ymm5, ymm5, ymm0
vpaddd ymm4, ymm4, ymm6
vpaddd ymm5, ymm5, ymm1
vpaddd ymm4, ymm4, ymm1
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
vpsllw ymm7, ymm7, BYTE_BIT
vpor ymm5, ymm5, ymm7 ; ymm5=Cb
vmovdqu YMMWORD [ebx], ymm5 ; Save Cb
vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
vmovdqa ymm4, ymm0
vpunpcklwd ymm0, ymm0, ymm3
vpunpckhwd ymm4, ymm4, ymm3
vmovdqa ymm7, ymm0
vmovdqa ymm5, ymm4
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
vpaddd ymm0, ymm0, YMMWORD [wk(4)]
vpaddd ymm4, ymm4, YMMWORD [wk(5)]
vpaddd ymm0, ymm0, ymm3
vpaddd ymm4, ymm4, ymm3
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
vpxor ymm3, ymm3, ymm3
vpxor ymm4, ymm4, ymm4
vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
vpaddd ymm7, ymm7, ymm3
vpaddd ymm5, ymm5, ymm4
vpaddd ymm7, ymm7, ymm1
vpaddd ymm5, ymm5, ymm1
vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
vmovdqa ymm4, ymm6
vpunpcklwd ymm6, ymm6, ymm2
vpunpckhwd ymm4, ymm4, ymm2
vmovdqa ymm1, ymm6
vmovdqa ymm5, ymm4
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
vpaddd ymm6, ymm6, YMMWORD [wk(6)]
vpaddd ymm4, ymm4, YMMWORD [wk(7)]
vpaddd ymm6, ymm6, ymm2
vpaddd ymm4, ymm4, ymm2
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
vpsllw ymm0, ymm0, BYTE_BIT
vpor ymm6, ymm6, ymm0 ; ymm6=Y
vmovdqu YMMWORD [edi], ymm6 ; Save Y
vpxor ymm2, ymm2, ymm2
vpxor ymm4, ymm4, ymm4
vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ]
vpaddd ymm1, ymm1, ymm2
vpaddd ymm5, ymm5, ymm4
vpaddd ymm1, ymm1, ymm0
vpaddd ymm5, ymm5, ymm0
vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
vpsllw ymm7, ymm7, BYTE_BIT
vpor ymm1, ymm1, ymm7 ; ymm1=Cr
vmovdqu YMMWORD [edx], ymm1 ; Save Cr
sub ecx, byte SIZEOF_YMMWORD
add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
add edi, byte SIZEOF_YMMWORD ; outptr0
add ebx, byte SIZEOF_YMMWORD ; outptr1
add edx, byte SIZEOF_YMMWORD ; outptr2
cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
pop ebx
pop edx
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,476 @@
;
; jccolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
; mmword wk[WK_NUM]
%define WK_NUM 8
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
EXTN(jsimd_rgb_ycc_convert_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edx
push ebx
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_MMWORD
jae short .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
xor eax, eax
mov al, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
xor edx, edx
mov dx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
movd mmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd mmG, dword [esi+ecx]
psllq mmA, DWORD_BIT
por mmA, mmG
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
movq mmG, mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
mov ecx, SIZEOF_MMWORD
jmp short .rgb_ycc_cnv
.column_ld16:
test cl, 2*SIZEOF_MMWORD
mov ecx, SIZEOF_MMWORD
jz short .rgb_ycc_cnv
movq mmF, mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
.rgb_ycc_cnv:
; mmA=(00 10 20 01 11 21 02 12)
; mmG=(22 03 13 23 04 14 24 05)
; mmF=(15 25 06 16 26 07 17 27)
movq mmD, mmA
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
movq mmE, mmA
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
pxor mmH, mmH
movq mmC, mmA
punpcklbw mmA, mmH ; mmA=(00 02 04 06)
punpckhbw mmC, mmH ; mmC=(10 12 14 16)
movq mmB, mmE
punpcklbw mmE, mmH ; mmE=(20 22 24 26)
punpckhbw mmB, mmH ; mmB=(01 03 05 07)
movq mmF, mmD
punpcklbw mmD, mmH ; mmD=(11 13 15 17)
punpckhbw mmF, mmH ; mmF=(21 23 25 27)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_MMWORD/8
jz short .column_ld2
sub ecx, byte SIZEOF_MMWORD/8
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_MMWORD/4
jz short .column_ld4
sub ecx, byte SIZEOF_MMWORD/4
movq mmF, mmA
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld4:
test cl, SIZEOF_MMWORD/2
mov ecx, SIZEOF_MMWORD
jz short .rgb_ycc_cnv
movq mmD, mmA
movq mmC, mmF
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
.rgb_ycc_cnv:
; mmA=(00 10 20 30 01 11 21 31)
; mmF=(02 12 22 32 03 13 23 33)
; mmD=(04 14 24 34 05 15 25 35)
; mmC=(06 16 26 36 07 17 27 37)
movq mmB, mmA
punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
movq mmG, mmD
punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
movq mmE, mmA
punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
movq mmH, mmB
punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
pxor mmF, mmF
movq mmC, mmA
punpcklbw mmA, mmF ; mmA=(00 02 04 06)
punpckhbw mmC, mmF ; mmC=(10 12 14 16)
movq mmD, mmB
punpcklbw mmB, mmF ; mmB=(01 03 05 07)
punpckhbw mmD, mmF ; mmD=(11 13 15 17)
movq mmG, mmE
punpcklbw mmE, mmF ; mmE=(20 22 24 26)
punpckhbw mmG, mmF ; mmG=(30 32 34 36)
punpcklbw mmF, mmH
punpckhbw mmH, mmH
psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
%endif ; RGB_PIXELSIZE ; ---------------
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movq MMWORD [wk(0)], mm0 ; wk(0)=RE
movq MMWORD [wk(1)], mm1 ; wk(1)=RO
movq MMWORD [wk(2)], mm4 ; wk(2)=BE
movq MMWORD [wk(3)], mm5 ; wk(3)=BO
movq mm6, mm1
punpcklwd mm1, mm3
punpckhwd mm6, mm3
movq mm7, mm1
movq mm4, mm6
pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor mm1, mm1
pxor mm6, mm6
punpcklwd mm1, mm5 ; mm1=BOL
punpckhwd mm6, mm5 ; mm6=BOH
psrld mm1, 1 ; mm1=BOL*FIX(0.500)
psrld mm6, 1 ; mm6=BOH*FIX(0.500)
movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
paddd mm7, mm1
paddd mm4, mm6
paddd mm7, mm5
paddd mm4, mm5
psrld mm7, SCALEBITS ; mm7=CbOL
psrld mm4, SCALEBITS ; mm4=CbOH
packssdw mm7, mm4 ; mm7=CbO
movq mm1, MMWORD [wk(2)] ; mm1=BE
movq mm6, mm0
punpcklwd mm0, mm2
punpckhwd mm6, mm2
movq mm5, mm0
movq mm4, mm6
pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor mm0, mm0
pxor mm6, mm6
punpcklwd mm0, mm1 ; mm0=BEL
punpckhwd mm6, mm1 ; mm6=BEH
psrld mm0, 1 ; mm0=BEL*FIX(0.500)
psrld mm6, 1 ; mm6=BEH*FIX(0.500)
movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
paddd mm5, mm0
paddd mm4, mm6
paddd mm5, mm1
paddd mm4, mm1
psrld mm5, SCALEBITS ; mm5=CbEL
psrld mm4, SCALEBITS ; mm4=CbEH
packssdw mm5, mm4 ; mm5=CbE
psllw mm7, BYTE_BIT
por mm5, mm7 ; mm5=Cb
movq MMWORD [ebx], mm5 ; Save Cb
movq mm0, MMWORD [wk(3)] ; mm0=BO
movq mm6, MMWORD [wk(2)] ; mm6=BE
movq mm1, MMWORD [wk(1)] ; mm1=RO
movq mm4, mm0
punpcklwd mm0, mm3
punpckhwd mm4, mm3
movq mm7, mm0
movq mm5, mm4
pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
paddd mm0, MMWORD [wk(4)]
paddd mm4, MMWORD [wk(5)]
paddd mm0, mm3
paddd mm4, mm3
psrld mm0, SCALEBITS ; mm0=YOL
psrld mm4, SCALEBITS ; mm4=YOH
packssdw mm0, mm4 ; mm0=YO
pxor mm3, mm3
pxor mm4, mm4
punpcklwd mm3, mm1 ; mm3=ROL
punpckhwd mm4, mm1 ; mm4=ROH
psrld mm3, 1 ; mm3=ROL*FIX(0.500)
psrld mm4, 1 ; mm4=ROH*FIX(0.500)
movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
paddd mm7, mm3
paddd mm5, mm4
paddd mm7, mm1
paddd mm5, mm1
psrld mm7, SCALEBITS ; mm7=CrOL
psrld mm5, SCALEBITS ; mm5=CrOH
packssdw mm7, mm5 ; mm7=CrO
movq mm3, MMWORD [wk(0)] ; mm3=RE
movq mm4, mm6
punpcklwd mm6, mm2
punpckhwd mm4, mm2
movq mm1, mm6
movq mm5, mm4
pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
paddd mm6, MMWORD [wk(6)]
paddd mm4, MMWORD [wk(7)]
paddd mm6, mm2
paddd mm4, mm2
psrld mm6, SCALEBITS ; mm6=YEL
psrld mm4, SCALEBITS ; mm4=YEH
packssdw mm6, mm4 ; mm6=YE
psllw mm0, BYTE_BIT
por mm6, mm0 ; mm6=Y
movq MMWORD [edi], mm6 ; Save Y
pxor mm2, mm2
pxor mm4, mm4
punpcklwd mm2, mm3 ; mm2=REL
punpckhwd mm4, mm3 ; mm4=REH
psrld mm2, 1 ; mm2=REL*FIX(0.500)
psrld mm4, 1 ; mm4=REH*FIX(0.500)
movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
paddd mm1, mm2
paddd mm5, mm4
paddd mm1, mm0
paddd mm5, mm0
psrld mm1, SCALEBITS ; mm1=CrEL
psrld mm5, SCALEBITS ; mm5=CrEH
packssdw mm1, mm5 ; mm1=CrE
psllw mm7, BYTE_BIT
por mm1, mm7 ; mm1=Cr
movq MMWORD [edx], mm1 ; Save Cr
sub ecx, byte SIZEOF_MMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
add edi, byte SIZEOF_MMWORD ; outptr0
add ebx, byte SIZEOF_MMWORD ; outptr1
add edx, byte SIZEOF_MMWORD ; outptr2
cmp ecx, byte SIZEOF_MMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
pop ebx
pop edx
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,503 @@
;
; jccolext.asm - colorspace conversion (SSE2)
;
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 8
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
EXTN(jsimd_rgb_ycc_convert_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edx
push ebx
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
movd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_ycc_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH, xmmH
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_ycc_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_ycc_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF, xmmF
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
movdqa xmm7, xmm1
movdqa xmm4, xmm6
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
pxor xmm1, xmm1
pxor xmm6, xmm6
punpcklwd xmm1, xmm5 ; xmm1=BOL
punpckhwd xmm6, xmm5 ; xmm6=BOH
psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
paddd xmm7, xmm1
paddd xmm4, xmm6
paddd xmm7, xmm5
paddd xmm4, xmm5
psrld xmm7, SCALEBITS ; xmm7=CbOL
psrld xmm4, SCALEBITS ; xmm4=CbOH
packssdw xmm7, xmm4 ; xmm7=CbO
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
movdqa xmm5, xmm0
movdqa xmm4, xmm6
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
pxor xmm0, xmm0
pxor xmm6, xmm6
punpcklwd xmm0, xmm1 ; xmm0=BEL
punpckhwd xmm6, xmm1 ; xmm6=BEH
psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm1
paddd xmm4, xmm1
psrld xmm5, SCALEBITS ; xmm5=CbEL
psrld xmm4, SCALEBITS ; xmm4=CbEH
packssdw xmm5, xmm4 ; xmm5=CbE
psllw xmm7, BYTE_BIT
por xmm5, xmm7 ; xmm5=Cb
movdqa XMMWORD [ebx], xmm5 ; Save Cb
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
movdqa xmm7, xmm0
movdqa xmm5, xmm4
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, XMMWORD [wk(4)]
paddd xmm4, XMMWORD [wk(5)]
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
pxor xmm3, xmm3
pxor xmm4, xmm4
punpcklwd xmm3, xmm1 ; xmm3=ROL
punpckhwd xmm4, xmm1 ; xmm4=ROH
psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
paddd xmm7, xmm3
paddd xmm5, xmm4
paddd xmm7, xmm1
paddd xmm5, xmm1
psrld xmm7, SCALEBITS ; xmm7=CrOL
psrld xmm5, SCALEBITS ; xmm5=CrOH
packssdw xmm7, xmm5 ; xmm7=CrO
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
movdqa xmm1, xmm6
movdqa xmm5, xmm4
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(6)]
paddd xmm4, XMMWORD [wk(7)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
pxor xmm2, xmm2
pxor xmm4, xmm4
punpcklwd xmm2, xmm3 ; xmm2=REL
punpckhwd xmm4, xmm3 ; xmm4=REH
psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
paddd xmm1, xmm2
paddd xmm5, xmm4
paddd xmm1, xmm0
paddd xmm5, xmm0
psrld xmm1, SCALEBITS ; xmm1=CrEL
psrld xmm5, SCALEBITS ; xmm5=CrEH
packssdw xmm1, xmm5 ; xmm1=CrE
psllw xmm7, BYTE_BIT
por xmm1, xmm7 ; xmm1=Cr
movdqa XMMWORD [edx], xmm1 ; Save Cr
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
add ebx, byte SIZEOF_XMMWORD ; outptr1
add edx, byte SIZEOF_XMMWORD ; outptr2
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
pop ebx
pop edx
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,121 @@
;
; jccolor.asm - colorspace conversion (AVX2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
EXTN(jconst_rgb_ycc_convert_avx2):
PW_F0299_F0337 times 8 dw F_0_299, F_0_337
PW_F0114_F0250 times 8 dw F_0_114, F_0_250
PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
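
; The conversion kernel in jccolext-avx2.asm is assembled repeatedly below:
; first with the RGB_* definitions already in effect, then once per extended
; pixel format, redefining RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE and
; renaming the entry point each time.  Each inclusion thus produces a separate
; jsimd_ext*_ycc_convert_avx2 variant from the same source.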
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
%include "jccolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
%include "jccolext-avx2.asm"

@@ -0,0 +1,121 @@
;
; jccolor.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
EXTN(jconst_rgb_ycc_convert_mmx):
PW_F0299_F0337 times 2 dw F_0_299, F_0_337
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331
PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418
PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
%include "jccolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
%include "jccolext-mmx.asm"

@ -0,0 +1,120 @@
;
; jccolor.asm - colorspace conversion (SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_081 equ 5329 ; FIX(0.08131)
F_0_114 equ 7471 ; FIX(0.11400)
F_0_168 equ 11059 ; FIX(0.16874)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_331 equ 21709 ; FIX(0.33126)
F_0_418 equ 27439 ; FIX(0.41869)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
%include "jccolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
%include "jccolext-sse2.asm"

@ -0,0 +1,113 @@
;
; jcgray.asm - grayscale colorspace conversion (AVX2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
EXTN(jconst_rgb_gray_convert_avx2):
PW_F0299_F0337 times 8 dw F_0_299, F_0_337
PW_F0114_F0250 times 8 dw F_0_114, F_0_250
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
%include "jcgryext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
%include "jcgryext-avx2.asm"

@ -0,0 +1,113 @@
;
; jcgray.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
EXTN(jconst_rgb_gray_convert_mmx):
PW_F0299_F0337 times 2 dw F_0_299, F_0_337
PW_F0114_F0250 times 2 dw F_0_114, F_0_250
PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
%include "jcgryext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
%include "jcgryext-mmx.asm"

@ -0,0 +1,112 @@
;
; jcgray.asm - grayscale colorspace conversion (SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_114 equ 7471 ; FIX(0.11400)
F_0_250 equ 16384 ; FIX(0.25000)
F_0_299 equ 19595 ; FIX(0.29900)
F_0_587 equ 38470 ; FIX(0.58700)
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2):
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
%include "jcgryext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
%include "jcgryext-sse2.asm"

@ -0,0 +1,457 @@
;
; jcgryext.asm - grayscale colorspace conversion (AVX2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
; ymmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
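; Argument offsets are taken relative to b = eax, which the prologue below
; sets to esp immediately after "push ebp": [b] is then the saved ebp, [b+4]
; the return address, and [b+8] the first cdecl argument.  wk(i) addresses the
; scratch YMMWORDs just below the 32-byte-aligned frame pointer, and gotptr
; sits immediately below them.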
align 32
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
EXTN(jsimd_rgb_gray_convert_avx2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
vmovd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
vmovd xmmF, XMM_DWORD [esi+ecx]
vpslldq xmmA, xmmA, SIZEOF_DWORD
vpor xmmA, xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
vmovq xmmB, XMM_MMWORD [esi+ecx]
vpslldq xmmA, xmmA, SIZEOF_MMWORD
vpor xmmA, xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
sub ecx, byte SIZEOF_XMMWORD
vmovdqu xmmB, XMM_MMWORD [esi+ecx]
vperm2i128 ymmA, ymmA, ymmA, 1
vpor ymmA, ymmB
.column_ld32:
test cl, SIZEOF_YMMWORD
jz short .column_ld64
sub ecx, byte SIZEOF_YMMWORD
vmovdqa ymmF, ymmA
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
.column_ld64:
test cl, 2*SIZEOF_YMMWORD
mov ecx, SIZEOF_YMMWORD
jz short .rgb_gray_cnv
vmovdqa ymmB, ymmA
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
.rgb_gray_cnv:
; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vmovdqu ymmC, ymmA
vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
vmovdqa ymmG, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
vmovdqa ymmD, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
vmovdqa ymmE, ymmA
vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
vpxor ymmH, ymmH, ymmH
vmovdqa ymmC, ymmA
vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
vmovdqa ymmB, ymmE
vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
vmovdqa ymmF, ymmD
vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
vpslldq xmmA, xmmA, SIZEOF_MMWORD
vpor xmmA, xmmA, xmmF
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
vmovdqa xmmF, xmmA
vperm2i128 ymmF, ymmF, ymmF, 1
vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
vpor ymmA, ymmA, ymmF
.column_ld8:
test cl, SIZEOF_XMMWORD/2
jz short .column_ld16
sub ecx, byte SIZEOF_XMMWORD/2
vmovdqa ymmF, ymmA
vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld16:
test cl, SIZEOF_XMMWORD
mov ecx, SIZEOF_YMMWORD
jz short .rgb_gray_cnv
vmovdqa ymmE, ymmA
vmovdqa ymmH, ymmF
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
.rgb_gray_cnv:
; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vmovdqa ymmB, ymmA
vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vmovdqa ymmB, ymmF
vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vmovdqa ymmD, ymmA
vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
vmovdqa ymmC, ymmF
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
vmovdqa ymmB, ymmA
vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
vmovdqa ymmG, ymmD
vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
vmovdqa ymmE, ymmA
vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
vmovdqa ymmH, ymmB
vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
vpxor ymmF, ymmF, ymmF
vmovdqa ymmC, ymmA
vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
vmovdqa ymmD, ymmB
vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
vmovdqa ymmG, ymmE
vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
vpunpcklbw ymmF, ymmF, ymmH
vpunpckhbw ymmH, ymmH, ymmH
vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
%endif ; RGB_PIXELSIZE ; ---------------
; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
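; The sequence below computes Y for the odd and even columns separately:
; R/G and B/G samples are interleaved into word pairs, multiply-added against
; PW_F0299_F0337 and PW_F0114_F0250 with vpmaddwd, rounded with PD_ONEHALF,
; descaled by SCALEBITS, and the two results are then packed and
; re-interleaved (vpsllw/vpor) into a single row of Y bytes stored at [edi].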
vmovdqa ymm6, ymm1
vpunpcklwd ymm1, ymm1, ymm3
vpunpckhwd ymm6, ymm6, ymm3
vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
vmovdqa ymm6, ymm0
vpunpcklwd ymm0, ymm0, ymm2
vpunpckhwd ymm6, ymm6, ymm2
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
vmovdqa ymm0, ymm5 ; ymm0=BO
vmovdqa ymm6, ymm4 ; ymm6=BE
vmovdqa ymm4, ymm0
vpunpcklwd ymm0, ymm0, ymm3
vpunpckhwd ymm4, ymm4, ymm3
vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
vpaddd ymm0, ymm0, ymm1
vpaddd ymm4, ymm4, ymm7
vpaddd ymm0, ymm0, ymm3
vpaddd ymm4, ymm4, ymm3
vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
vmovdqa ymm4, ymm6
vpunpcklwd ymm6, ymm6, ymm2
vpunpckhwd ymm4, ymm4, ymm2
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
vpaddd ymm6, ymm6, YMMWORD [wk(0)]
vpaddd ymm4, ymm4, YMMWORD [wk(1)]
vpaddd ymm6, ymm6, ymm2
vpaddd ymm4, ymm4, ymm2
vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
vpsllw ymm0, ymm0, BYTE_BIT
vpor ymm6, ymm6, ymm0 ; ymm6=Y
vmovdqu YMMWORD [edi], ymm6 ; Save Y
sub ecx, byte SIZEOF_YMMWORD
add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
add edi, byte SIZEOF_YMMWORD ; outptr0
cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@ -0,0 +1,355 @@
;
; jcgryext.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
; mmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
EXTN(jsimd_rgb_gray_convert_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] ; num_cols
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_MMWORD
jae short .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
xor eax, eax
mov al, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
xor edx, edx
mov dx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
movd mmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd mmG, dword [esi+ecx]
psllq mmA, DWORD_BIT
por mmA, mmG
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
movq mmG, mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
mov ecx, SIZEOF_MMWORD
jmp short .rgb_gray_cnv
.column_ld16:
test cl, 2*SIZEOF_MMWORD
mov ecx, SIZEOF_MMWORD
jz short .rgb_gray_cnv
movq mmF, mmA
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
.rgb_gray_cnv:
; mmA=(00 10 20 01 11 21 02 12)
; mmG=(22 03 13 23 04 14 24 05)
; mmF=(15 25 06 16 26 07 17 27)
movq mmD, mmA
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
movq mmE, mmA
psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
pxor mmH, mmH
movq mmC, mmA
punpcklbw mmA, mmH ; mmA=(00 02 04 06)
punpckhbw mmC, mmH ; mmC=(10 12 14 16)
movq mmB, mmE
punpcklbw mmE, mmH ; mmE=(20 22 24 26)
punpckhbw mmB, mmH ; mmB=(01 03 05 07)
movq mmF, mmD
punpcklbw mmD, mmH ; mmD=(11 13 15 17)
punpckhbw mmF, mmH ; mmF=(21 23 25 27)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_MMWORD/8
jz short .column_ld2
sub ecx, byte SIZEOF_MMWORD/8
movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_MMWORD/4
jz short .column_ld4
sub ecx, byte SIZEOF_MMWORD/4
movq mmF, mmA
movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld4:
test cl, SIZEOF_MMWORD/2
mov ecx, SIZEOF_MMWORD
jz short .rgb_gray_cnv
movq mmD, mmA
movq mmC, mmF
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
.rgb_gray_cnv:
; mmA=(00 10 20 30 01 11 21 31)
; mmF=(02 12 22 32 03 13 23 33)
; mmD=(04 14 24 34 05 15 25 35)
; mmC=(06 16 26 36 07 17 27 37)
movq mmB, mmA
punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
movq mmG, mmD
punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
movq mmE, mmA
punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
movq mmH, mmB
punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
pxor mmF, mmF
movq mmC, mmA
punpcklbw mmA, mmF ; mmA=(00 02 04 06)
punpckhbw mmC, mmF ; mmC=(10 12 14 16)
movq mmD, mmB
punpcklbw mmB, mmF ; mmB=(01 03 05 07)
punpckhbw mmD, mmF ; mmD=(11 13 15 17)
movq mmG, mmE
punpcklbw mmE, mmF ; mmE=(20 22 24 26)
punpckhbw mmG, mmF ; mmG=(30 32 34 36)
punpcklbw mmF, mmH
punpckhbw mmH, mmH
psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
%endif ; RGB_PIXELSIZE ; ---------------
; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movq mm6, mm1
punpcklwd mm1, mm3
punpckhwd mm6, mm3
pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movq mm6, mm0
punpcklwd mm0, mm2
punpckhwd mm6, mm2
pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movq mm0, mm5 ; mm0=BO
movq mm6, mm4 ; mm6=BE
movq mm4, mm0
punpcklwd mm0, mm3
punpckhwd mm4, mm3
pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
paddd mm0, mm1
paddd mm4, mm7
paddd mm0, mm3
paddd mm4, mm3
psrld mm0, SCALEBITS ; mm0=YOL
psrld mm4, SCALEBITS ; mm4=YOH
packssdw mm0, mm4 ; mm0=YO
movq mm4, mm6
punpcklwd mm6, mm2
punpckhwd mm4, mm2
pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
paddd mm6, MMWORD [wk(0)]
paddd mm4, MMWORD [wk(1)]
paddd mm6, mm2
paddd mm4, mm2
psrld mm6, SCALEBITS ; mm6=YEL
psrld mm4, SCALEBITS ; mm4=YEH
packssdw mm6, mm4 ; mm6=YE
psllw mm0, BYTE_BIT
por mm6, mm0 ; mm6=Y
movq MMWORD [edi], mm6 ; Save Y
sub ecx, byte SIZEOF_MMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
add edi, byte SIZEOF_MMWORD ; outptr0
cmp ecx, byte SIZEOF_MMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@ -0,0 +1,382 @@
;
; jcgryext.asm - grayscale colorspace conversion (SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
; JSAMPIMAGE output_buf, JDIMENSION output_row,
; int num_rows);
;
%define img_width(b) (b) + 8 ; JDIMENSION img_width
%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
%define output_row(b) (b) + 20 ; JDIMENSION output_row
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
EXTN(jsimd_rgb_gray_convert_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx
jz near .return
push ecx
mov esi, JSAMPIMAGE [output_buf(eax)]
mov ecx, JDIMENSION [output_row(eax)]
mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
lea edi, [edi+ecx*SIZEOF_JSAMPROW]
pop ecx
mov esi, JSAMPARRAY [input_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
pushpic eax
push edi
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
alignx 16, 7
%if RGB_PIXELSIZE == 3 ; ---------------
.column_ld1:
push eax
push edx
lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
movd xmmA, eax
pop edx
pop eax
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
movd xmmF, XMM_DWORD [esi+ecx]
pslldq xmmA, SIZEOF_DWORD
por xmmA, xmmF
.column_ld8:
test cl, SIZEOF_MMWORD
jz short .column_ld16
sub ecx, byte SIZEOF_MMWORD
movq xmmB, XMM_MMWORD [esi+ecx]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmB
.column_ld16:
test cl, SIZEOF_XMMWORD
jz short .column_ld32
movdqa xmmF, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
mov ecx, SIZEOF_XMMWORD
jmp short .rgb_gray_cnv
.column_ld32:
test cl, 2*SIZEOF_XMMWORD
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmB, xmmA
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
movdqa xmmG, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
movdqa xmmD, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
movdqa xmmE, xmmA
pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
pxor xmmH, xmmH
movdqa xmmC, xmmA
punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmB, xmmE
punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
movdqa xmmF, xmmD
punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
%else ; RGB_PIXELSIZE == 4 ; -----------
.column_ld1:
test cl, SIZEOF_XMMWORD/16
jz short .column_ld2
sub ecx, byte SIZEOF_XMMWORD/16
movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_XMMWORD/8
jz short .column_ld4
sub ecx, byte SIZEOF_XMMWORD/8
movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
pslldq xmmA, SIZEOF_MMWORD
por xmmA, xmmE
.column_ld4:
test cl, SIZEOF_XMMWORD/4
jz short .column_ld8
sub ecx, byte SIZEOF_XMMWORD/4
movdqa xmmE, xmmA
movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
test cl, SIZEOF_XMMWORD/2
mov ecx, SIZEOF_XMMWORD
jz short .rgb_gray_cnv
movdqa xmmF, xmmA
movdqa xmmH, xmmE
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv
alignx 16, 7
.columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
.rgb_gray_cnv:
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
movdqa xmmC, xmmF
punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
movdqa xmmB, xmmA
punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
movdqa xmmG, xmmD
punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
movdqa xmmE, xmmA
punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
movdqa xmmH, xmmB
punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
pxor xmmF, xmmF
movdqa xmmC, xmmA
punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
movdqa xmmD, xmmB
punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
movdqa xmmG, xmmE
punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
punpcklbw xmmF, xmmH
punpckhbw xmmH, xmmH
psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
%endif ; RGB_PIXELSIZE ; ---------------
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
; (Original)
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
;
; (This implementation)
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
movdqa xmm6, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm6, xmm3
pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
movdqa xmm6, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm6, xmm2
pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
movdqa xmm0, xmm5 ; xmm0=BO
movdqa xmm6, xmm4 ; xmm6=BE
movdqa xmm4, xmm0
punpcklwd xmm0, xmm3
punpckhwd xmm4, xmm3
pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
paddd xmm0, xmm1
paddd xmm4, xmm7
paddd xmm0, xmm3
paddd xmm4, xmm3
psrld xmm0, SCALEBITS ; xmm0=YOL
psrld xmm4, SCALEBITS ; xmm4=YOH
packssdw xmm0, xmm4 ; xmm0=YO
movdqa xmm4, xmm6
punpcklwd xmm6, xmm2
punpckhwd xmm4, xmm2
pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
paddd xmm6, XMMWORD [wk(0)]
paddd xmm4, XMMWORD [wk(1)]
paddd xmm6, xmm2
paddd xmm4, xmm2
psrld xmm6, SCALEBITS ; xmm6=YEL
psrld xmm4, SCALEBITS ; xmm4=YEH
packssdw xmm6, xmm4 ; xmm6=YE
psllw xmm0, BYTE_BIT
por xmm6, xmm0 ; xmm6=Y
movdqa XMMWORD [edi], xmm6 ; Save Y
sub ecx, byte SIZEOF_XMMWORD
add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
add edi, byte SIZEOF_XMMWORD ; outptr0
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .column_ld1
pop ecx ; col
pop esi
pop edi
poppic eax
add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW
dec eax ; num_rows
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@ -0,0 +1,761 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
;
; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.
%include "jsimdext.inc"
struc working_state
.next_output_byte: resp 1 ; => next byte to write in buffer
.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
.cur.free_bits resd 1 ; # of bits available in it
.cur.last_dc_val resd 4 ; last DC coef for each component
.cinfo: resp 1 ; dump_buffer needs access to this
endstruc
struc c_derived_tbl
.ehufco: resd 256 ; code for each symbol
.ehufsi: resb 256 ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc
; --------------------------------------------------------------------------
SECTION SEG_CONST
GLOBAL_DATA(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
alignz 32
jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
dq 0x000f, 0x001f, 0x003f, 0x007f
dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
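; jpeg_mask_bits[n] = (1 << n) - 1.  These masks keep only the low "nbits"
; bits of a coefficient (after the complement adjustment applied to negative
; values), as in the C encoder in jchuff.c.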
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 << 9 db 10
times 1 << 8 db 9
times 1 << 7 db 8
times 1 << 6 db 7
times 1 << 5 db 6
times 1 << 4 db 5
times 1 << 3 db 4
times 1 << 2 db 3
times 1 << 1 db 2
times 1 << 0 db 1
times 1 db 0
jpeg_nbits_table:
times 1 db 0
times 1 << 0 db 1
times 1 << 1 db 2
times 1 << 2 db 3
times 1 << 3 db 4
times 1 << 4 db 5
times 1 << 5 db 6
times 1 << 6 db 7
times 1 << 7 db 8
times 1 << 8 db 9
times 1 << 9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
alignz 32
%ifdef PIC
%define NBITS(x) nbits_base + x
%else
%define NBITS(x) jpeg_nbits_table + x
%endif
%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
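; With PIC, a single base register (nbits_base) anchors both lookup tables:
; NBITS(x) indexes jpeg_nbits_table directly, and MASK_BITS(x) reaches
; jpeg_mask_bits through its constant offset from jpeg_nbits_table (the * 8
; accounts for the mask table's quadword entries), avoiding a GOT access per
; lookup inside the encoding loop.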
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%define mm_put_buffer mm0
%define mm_all_0xff mm1
%define mm_temp mm2
%define mm_nbits mm3
%define mm_code_bits mm3
%define mm_code mm4
%define mm_overflow_bits mm5
%define mm_save_nbits mm6
; Shorthand used to describe SIMD operations:
; wN: xmmN treated as eight signed 16-bit values
; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
; bN: xmmN treated as 16 unsigned 8-bit values, or
; mmN treated as eight unsigned 8-bit values
; bN[i]: perform the same operation on all unsigned 8-bit values,
; i=0..15 (SSE register) or i=0..7 (MMX register)
; Contents of SIMD registers are shown in memory order.
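; For example, in this notation "w2[i] = w2[i] + w3[i]" would describe the
; effect of "paddw xmm2, xmm3" on each of the eight 16-bit lanes.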
; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - temp register
; %2 - low byte of temp register
; %3 - second byte of temp register
; %4-%8 (optional) - extra instructions to execute before the macro completes
; %9 - the label to which to jump when the macro completes
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits. temp and code will
; be clobbered.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.
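; (In JPEG entropy-coded data, a 0xFF byte must be followed by a stuffed 0x00
; byte so that decoders do not mistake it for the start of a marker; that is
; what the 0xFF 0x00 encoding below implements.)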
%macro EMIT_QWORD 9
%define %%temp %1
%define %%tempb %2
%define %%temph %3
add nbits, free_bits ; nbits += free_bits;
neg free_bits ; free_bits = -free_bits;
movq mm_temp, mm_code ; temp = code;
movd mm_nbits, nbits ; nbits --> MMX register
movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
neg free_bits ; free_bits = -free_bits;
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
add free_bits, 64 ; free_bits += 64;
por mm_temp, mm_put_buffer ; temp |= put_buffer;
%ifidn %%temp, nbits_base
movd mm_save_nbits, nbits_base ; save nbits_base
%endif
movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
movq mm_put_buffer, mm_code ; put_buffer = code;
pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
movq mm_code, mm_code_bits ; code = code_bits;
psrlq mm_code_bits, 32 ; code_bits >>= 32;
pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
movd %%temp, mm_code_bits ; temp = code_bits;
bswap %%temp ; temp = htonl(temp);
test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
jnz %%.SLOW ; goto %%.SLOW
mov dword [buffer], %%temp ; *(uint32_t)buffer = temp;
%ifidn %%temp, nbits_base
movd nbits_base, mm_save_nbits ; restore nbits_base
%endif
%4
movd nbits, mm_code ; nbits = (uint32_t)(code);
%5
bswap nbits ; nbits = htonl(nbits);
mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits;
lea buffer, [buffer + 8] ; buffer += 8;
%6
%7
%8
jmp %9 ; return
%%.SLOW:
; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
; bytes in the qword.
mov byte [buffer], %%tempb ; buffer[0] = temp[0];
cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
mov byte [buffer], %%temph ; buffer[0] = temp[1];
cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
shr %%temp, 16 ; temp >>= 16;
mov byte [buffer], %%tempb ; buffer[0] = temp[0];
cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
mov byte [buffer], %%temph ; buffer[0] = temp[1];
cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
%ifidn %%temp, nbits_base
movd nbits_base, mm_save_nbits ; restore nbits_base
%endif
bswap nbits ; nbits = htonl(nbits)
mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
shr nbits, 16 ; nbits >>= 16;
mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
%4
cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
mov byte [buffer+1], 0 ; buffer[1] = 0;
sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
%5
%6
%7
%8
jmp %9 ; return;
%endmacro
%macro PUSH 1
push %1
%assign stack_offset stack_offset + 4
%endmacro
%macro POP 1
pop %1
%assign stack_offset stack_offset - 4
%endmacro
; If PIC is defined, load the address of a symbol defined in this file into a
; register. Equivalent to
; get_GOT %1
; lea %1, [GOTOFF(%1, %2)]
; without using the GOT.
;
; Usage:
; %1 - register into which to load the address of the symbol
; %2 - symbol whose address should be loaded
; %3 - optional multi-line macro to execute before the symbol address is loaded
; %4 - optional multi-line macro to execute after the symbol address is loaded
;
; If PIC is not defined, then %3 and %4 are executed in order.
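;
; The runtime effect of the PIC path is, conceptually (a sketch, not literal
; generated code):
;
;   reg = &%%.ref;           /* "call %%.geteip" pushes the return address;   */
;                            /* %%.geteip reads it with "mov reg, [esp]"      */
;   reg += symbol - %%.ref;  /* reg now holds the load-time address of symbol */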
%macro GET_SYM 2-4
%ifdef PIC
call %%.geteip
%%.ref:
%4
add %1, %2 - %%.ref
jmp short %%.done
align 32
%%.geteip:
%3 4 ; must adjust stack pointer because of call
mov %1, POINTER [esp]
ret
align 32
%%.done:
%else
%3 0
%4
%endif
%endmacro
;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
; JCOEFPTR block, int last_dc_val,
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; Stack layout:
; Function args
; Return address
; Saved ebx
; Saved ebp
; Saved esi
; Saved edi <-- esp_save
; ...
; esp_save
; t_ 64*2 bytes (aligned to 128 bytes)
;
; esp is used (as t) to point into t_ (data at lower indices is not used once
; esp has passed over it, so this is signal-safe).  Aligning to 128 bytes
; allows us to find the rest of the data again.
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel. In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support. The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.) This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
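;
; The pattern used below is therefore (register names here are generic):
;   xor   reg, reg           ; break the (false) input dependency on reg
;   tzcnt reg, mask          ; reg = # of trailing 0 bits (executes as bsf on
;                            ;   CPUs without BMI1)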
;
; Initial register allocation
; eax - frame --> buffer
; ebx - nbits_base (PIC) / emit_temp
; ecx - dctbl --> size --> state
; edx - block --> nbits
; esi - code_temp --> state --> actbl
; edi - index_temp --> free_bits
; esp - t
; ebp - index
%define frame eax
%ifdef PIC
%define nbits_base ebx
%endif
%define emit_temp ebx
%define emit_tempb bl
%define emit_temph bh
%define dctbl ecx
%define block edx
%define code_temp esi
%define index_temp edi
%define t esp
%define index ebp
%assign save_frame DCTSIZE2 * SIZEOF_WORD
; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
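;
; In scalar terms, Step 1 computes roughly the following (a sketch; compare
; encode_one_block() in jchuff.c):
;
;   for (i = 1; i < DCTSIZE2; i++) {
;     int v = block[jpeg_natural_order[i]];
;     t[i] = (v < 0) ? v - 1 : v;    /* bias negatives so the low nbits bits of
;                                       t[i] form the JPEG complement code */
;   }
;   /* index = 64-bit bitmap (two 32-bit halves) of nonzero coefficients in
;      zigzag order; the DC slot is handled separately */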
align 32
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
%assign stack_offset 0
%define arg_state 4 + stack_offset
%define arg_buffer 8 + stack_offset
%define arg_block 12 + stack_offset
%define arg_last_dc_val 16 + stack_offset
%define arg_dctbl 20 + stack_offset
%define arg_actbl 24 + stack_offset
;X: X = code stream
mov block, [esp + arg_block]
PUSH ebx
PUSH ebp
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
PUSH esi
PUSH edi
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
mov frame, esp
lea t, [frame - (save_frame + 4)]
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
mov [t + save_frame], frame
pxor xmm4, xmm4 ;A: w4[i] = 0;
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
;A: (Row 0, offset 1)
pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
paddw xmm0, xmm4 ;A: w0[i] += w4[i];
movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
pxor xmm4, xmm4 ;A: w4[i] = 0;
psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
; (Row 1, offset 1)
pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
paddw xmm1, xmm4 ;B: w1[i] += w4[i];
movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
pxor xmm4, xmm4 ;B: w4[i] = 0;
pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
; w/ signed saturation
pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
; (Row 3, offset 1)
pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
paddw xmm3, xmm4 ;D: w3[i] += w4[i];
movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
pxor xmm4, xmm4 ;D: w4[i] = 0;
pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
; (Row 2, offset 1)
pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
paddw xmm2, xmm4 ;C: w2[i] += w4[i];
movsx code_temp, word [block] ;Z: code_temp = block[0];
; %1 - stack pointer adjustment
%macro GET_SYM_BEFORE 1
movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
;C: t[i+16] = w2[i];
pxor xmm4, xmm4 ;C: w4[i] = 0;
pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
; w/ signed saturation
movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
shl index_temp, 16 ;Z: index_temp <<= 16;
psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
pxor xmm2, xmm2 ;H: w2[i] = 0;
pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
or index, index_temp ;Z: index |= index_temp;
%undef index_temp
%define free_bits edi
%endmacro
%macro GET_SYM_AFTER 0
movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
pxor xmm0, xmm0 ;H: w0[i] = 0;
not index ;Z: index = ~index;
pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
; (Row 7, offset 1)
pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
mov dctbl, [frame + arg_dctbl]
paddw xmm3, xmm2 ;H: w3[i] += w2[i];
movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
%endmacro
GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
;Z: i.e. if code_temp is positive
pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
; (Row 6, offset 1)
adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
pxor xmm2, xmm2 ;G: w2[i] = 0;
pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
paddw xmm4, xmm0 ;G: w4[i] += w0[i];
movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
movd mm_temp, code_temp ;Z: temp = code_temp
pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
; (Row 5, offset 1)
pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
; w/ signed saturation
lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
pxor xmm0, xmm0 ;F: w0[i] = 0;
pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
paddw xmm1, xmm2 ;F: w1[i] += w2[i];
movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
; (Row 4, offset 1)
%undef block
%define nbits edx
%define nbitsb dl
%define nbitsh dh
movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
%undef code_temp
%define state esi
pxor xmm2, xmm2 ;E: w2[i] = 0;
mov state, [frame + arg_state]
movd mm_nbits, nbits ;Z: nbits --> MMX register
pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
;Z: code = dctbl->ehufco[nbits];
%define size ecx
%define sizeb cl
%define sizeh ch
paddw xmm5, xmm0 ;E: w5[i] += w0[i];
movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
;Z: size = dctbl->ehufsi[nbits];
%undef dctbl
pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
; w/ signed saturation
movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
;Z: put_buffer = state->cur.put_buffer.simd;
mov free_bits, [state + working_state.cur.free_bits]
;Z: free_bits = state->cur.free_bits;
%undef state
%define actbl esi
mov actbl, [frame + arg_actbl]
%define buffer eax
mov buffer, [frame + arg_buffer]
%undef frame
jmp .BEGIN
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
; size <= 32, so this is not really a loop
.BRLOOP1: ; .BRLOOP1:
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
; nbits = actbl->ehufsi[0xf0];
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
; code = actbl->ehufco[0xf0];
and index, 0x7ffffff ; clear index if size == 32
sub size, 16 ; size -= 16;
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
movd mm_nbits, nbits ; nbits --> MMX register
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
por mm_put_buffer, mm_code ; put_buffer |= code;
jmp .ERLOOP1 ; goto .ERLOOP1;
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
%ifdef PIC
times 6 nop
%else
times 2 nop
%endif
.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
; if size == 32, index remains unchanged. Correct in .BRLOOP.
shr index, sizeb ; index >>= size;
lea t, [t + size * SIZEOF_WORD] ; t += size;
cmp size, 16 ; if (size > 16)
jg .BRLOOP1 ; goto .BRLOOP1;
.ERLOOP1: ; .ERLOOP1:
movsx nbits, word [t] ; nbits = *t;
%ifdef PIC
add size, size ; size += size;
%else
lea size, [size * 2] ; size += size;
%endif
movd mm_temp, nbits ; temp = nbits;
movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
movd mm_nbits, nbits ; nbits --> MMX register
movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
; code = actbl->ehufco[size-16];
movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
; size = actbl->ehufsi[size-16];
.BEGIN: ; .BEGIN:
pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
psllq mm_code, mm_nbits ; code <<= nbits;
add nbits, size ; nbits += size;
por mm_code, mm_temp ; code |= temp;
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
xor size, size ; size = 0; /* kill tzcnt input dependency */
tzcnt size, index ; size = # of trailing 0 bits in index
movd mm_nbits, nbits ; nbits --> MMX register
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
inc size ; ++size;
por mm_put_buffer, mm_code ; put_buffer |= code;
test index, index
jnz .BLOOP1 ; } while (index != 0);
; Round 2
; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
.ELOOP1: ; .ELOOP1:
pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
shl size, 16 ; size <<= 16;
or index, size ; index |= size;
not index ; index = ~index;
lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
; nbits = t + 1 + 64;
and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
sub nbits, t ; nbits -= t;
shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
tzcnt size, index ; size = # of trailing 0 bits in index
inc size ; ++size;
test index, index ; if (index == 0)
jz .ELOOP2 ; goto .ELOOP2;
; NOTE: size == 32 cannot happen, since the last element is always 0.
shr index, sizeb ; index >>= size;
lea size, [size + nbits - 33] ; size = size + nbits - 33;
lea t, [t + size * SIZEOF_WORD] ; t += size;
cmp size, 16 ; if (size <= 16)
jle .ERLOOP2 ; goto .ERLOOP2;
.BRLOOP2: ; do {
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
; nbits = actbl->ehufsi[0xf0];
sub size, 16 ; size -= 16;
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
; code = actbl->ehufco[0xf0];
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
movd mm_nbits, nbits ; else { nbits --> MMX register
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
por mm_put_buffer, mm_code ; put_buffer |= code;
cmp size, 16 ; if (size <= 16)
jle .ERLOOP2 ; goto .ERLOOP2;
jmp .BRLOOP2 ; } while (1);
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
shr index, sizeb ; index >>= size;
lea t, [t + size * SIZEOF_WORD] ; t += size;
cmp size, 16 ; if (size > 16)
jg .BRLOOP2 ; goto .BRLOOP2;
.ERLOOP2: ; .ERLOOP2:
movsx nbits, word [t] ; nbits = *t;
add size, size ; size += size;
movd mm_temp, nbits ; temp = nbits;
movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
movd mm_nbits, nbits ; nbits --> MMX register
lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
; code = actbl->ehufco[size-16];
movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
; size = actbl->ehufsi[size-16];
psllq mm_code, mm_nbits ; code <<= nbits;
pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
lea nbits, [nbits + size] ; nbits += size;
por mm_code, mm_temp ; code |= temp;
xor size, size ; size = 0; /* kill tzcnt input dependency */
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
tzcnt size, index ; size = # of trailing 0 bits in index
movd mm_nbits, nbits ; nbits --> MMX register
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
inc size ; ++size;
por mm_put_buffer, mm_code ; put_buffer |= code;
test index, index
jnz .BLOOP2 ; } while (index != 0);
.ELOOP2: ; .ELOOP2:
mov nbits, t ; nbits = t;
lea t, [t + SIZEOF_WORD] ; t = &t[1];
and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
je .EFN ; {
movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
; code = actbl->ehufco[0];
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
; nbits = actbl->ehufsi[0];
sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
jg .EFN_SKIP_EMIT_CODE ; {
EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
align 16
.EFN_SKIP_EMIT_CODE: ; } else {
movd mm_nbits, nbits ; nbits --> MMX register
psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
por mm_put_buffer, mm_code ; put_buffer |= code;
.EFN: ; } }
%define frame esp
mov frame, [t + save_frame]
%define state ecx
mov state, [frame + arg_state]
movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
; state->cur.put_buffer.simd = put_buffer;
emms
mov [state + working_state.cur.free_bits], free_bits
; state->cur.free_bits = free_bits;
POP edi
POP esi
POP ebp
POP ebx
ret
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
.EMIT_BRLOOP1:
EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
.ERLOOP1
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
.EMIT_ERLOOP1:
EMIT_QWORD size, sizeb, sizeh, \
{ xor size, size }, \
{ tzcnt size, index }, \
{ inc size }, \
{ test index, index }, \
{ jnz .BLOOP1 }, \
.ELOOP1
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
.EMIT_BRLOOP2:
EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
{ cmp size, 16 }, \
{ jle .ERLOOP2 }, \
.BRLOOP2
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
align 16
.EMIT_ERLOOP2:
EMIT_QWORD size, sizeb, sizeh, \
{ xor size, size }, \
{ tzcnt size, index }, \
{ inc size }, \
{ test index, index }, \
{ jnz .BLOOP2 }, \
.ELOOP2
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,662 @@
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
pxor N1, N1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
pinsrw X1, word [BLOCK + T1 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
pinsrw X1, word [BLOCK + T1 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
pinsrw X1, word [BLOCK + T1 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
pinsrw X1, word [BLOCK + T1 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
pinsrw X1, word [BLOCK + T1 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
pinsrw X1, word [BLOCK + T1 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
mov T1, INT [LUT + 15*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
pinsrw X1, word [BLOCK + T1 * 2], 7
%endmacro
%macro LOAD15 0
pxor N0, N0
pxor N1, N1
pxor X1, X1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
cmp LENEND, 2
jl %%.ELOAD15
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD15
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD15
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD15
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD15
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD15
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
%macro LOAD8 0
pxor N0, N0
mov T0, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
%endmacro
%macro LOAD7 0
pxor N0, N0
pxor X0, X0
mov T1, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 0
cmp LENEND, 2
jl %%.ELOAD7
mov T1, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD7
mov T1, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD7
mov T1, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD7
mov T1, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD7
mov T1, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD7
mov T1, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
%macro REDUCE0 0
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
movdqa xmm2, XMMWORD [VALUES + (16*2)]
movdqa xmm3, XMMWORD [VALUES + (24*2)]
movdqa xmm4, XMMWORD [VALUES + (32*2)]
movdqa xmm5, XMMWORD [VALUES + (40*2)]
movdqa xmm6, XMMWORD [VALUES + (48*2)]
pcmpeqw xmm0, ZERO
pcmpeqw xmm1, ZERO
pcmpeqw xmm2, ZERO
pcmpeqw xmm3, ZERO
pcmpeqw xmm4, ZERO
pcmpeqw xmm5, ZERO
pcmpeqw xmm6, ZERO
pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
packsswb xmm0, xmm1
packsswb xmm2, xmm3
packsswb xmm4, xmm5
packsswb xmm6, xmm7
pmovmskb eax, xmm0
pmovmskb ecx, xmm2
pmovmskb edx, xmm4
pmovmskb esi, xmm6
shl ecx, 16
shl esi, 16
or eax, ecx
or edx, esi
not eax
not edx
mov edi, ZEROBITS
mov INT [edi], eax
mov INT [edi+SIZEOF_INT], edx
%endmacro
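; In effect, REDUCE0 stores (roughly):
;   zerobits[0] = ~(bitmap of (values[ 0..31] == 0));  /* bit k set => nonzero */
;   zerobits[1] = ~(bitmap of (values[32..63] == 0));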
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *values,
; size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
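;
; Scalar sketch of what this routine produces (compare encode_mcu_AC_first()
; in jcphuff.c; helper names here are only illustrative):
;
;   for (k = 0; k < Sl; k++) {
;     int temp = block[jpeg_natural_order_start[k]];
;     if (temp < 0) {
;       temp = (-temp) >> Al;              /* point-transformed magnitude */
;       values[k]            = temp;
;       values[k + DCTSIZE2] = ~temp;      /* one's complement carries the sign */
;     } else {
;       temp >>= Al;
;       values[k]            = temp;
;       values[k + DCTSIZE2] = temp;
;     }
;   }
;   /* remaining values[] entries are zero-padded; *zerobits is the bitmap of
;      nonzero values[k], built by REDUCE0 above */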
%define ZERO xmm7
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define LEN ebp
%define ZEROBITS INT [esp + 5 * 4]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 4
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov ZEROBITS, T0
mov LEN, INT [eax + 16]
pxor ZERO, ZERO
mov K, LEN
and K, -16
shr K, 4
jz .ELOOP16
.BLOOP16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
dec K
jnz .BLOOP16
test LEN, 15
je .PADDING
.ELOOP16:
mov LENEND, LEN
and LENEND, 7
test LEN, 8
jz .TRY7
test LEN, 7
jz .TRY8
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
pxor N0, X0
pxor N1, X1
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
add VALUES, 16*2
jmp .PADDING
.TRY8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
jmp .PADDING
.TRY7:
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
pxor N0, X0
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
add VALUES, 8*2
.PADDING:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDING
align 16
.ZEROLOOP:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOP
.EPADDING:
sub VALUES, DCTSIZE2*2
REDUCE0
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *absvalues,
; size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *absvalues
; eax + 28 = size_t *bits
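;
; Scalar sketch of what this routine produces (compare encode_mcu_AC_refine()
; in jcphuff.c; helper names here are only illustrative):
;
;   EOB = 0;
;   for (k = 0; k < Sl; k++) {
;     int temp = block[jpeg_natural_order_start[k]];
;     absvalues[k] = abs(temp) >> Al;        /* point-transformed magnitude */
;     if (absvalues[k] == 1)
;       EOB = position of this coefficient;  /* last such k wins */
;   }
;   bits[0..1] = bitmap of nonzero absvalues[k];       /* zerobits */
;   bits[2..3] = bitmap of non-negative coefficients;  /* sign bits */
;   return EOB;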
%define ZERO xmm7
%define ONE xmm5
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T0w cx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define KK ebp
%define ZEROBITS INT [esp + 5 * 4]
%define EOB INT [esp + 5 * 4 + 4]
%define LEN INT [esp + 5 * 4 + 8]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 16
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
pcmpeqw ONE, ONE
psrlw ONE, 15
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov K, INT [eax + 16]
mov INT [T0 + 2 * SIZEOF_INT], -1
mov INT [T0 + 3 * SIZEOF_INT], -1
mov ZEROBITS, T0
mov LEN, K
pxor ZERO, ZERO
and K, -16
mov EOB, 0
xor KK, KK
shr K, 4
jz .ELOOPR16
.BLOOPR16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER16 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER16:
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
add KK, 2
dec K
jnz .BLOOPR16
test LEN, 15
je .PADDINGR
.ELOOPR16:
mov LENEND, LEN
test LENEND, 8
jz .TRYR7
test LENEND, 7
jz .TRYR8
and LENEND, 7
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER15 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER15:
add VALUES, 16*2
jmp .PADDINGR
.TRYR8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER8 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER8:
add VALUES, 8*2
jmp .PADDINGR
.TRYR7:
and LENEND, 7
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER7 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER7:
add VALUES, 8*2
.PADDINGR:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDINGR
align 16
.ZEROLOOPR:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOPR
.EPADDINGR:
sub VALUES, DCTSIZE2*2
REDUCE0
mov eax, EOB
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,388 @@
;
; jcsample.asm - downsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
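; Per output sample the computation is, roughly (a sketch):
;
;   out[i] = (in[2*i] + in[2*i+1] + bias) >> 1;   /* bias alternates 0,1,0,1,... */
;
; after the rightmost input sample has been replicated out to 2*output_cols
; (the expand_right_edge step below), so partial vectors read valid data.
;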
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
EXTN(jsimd_h2v1_downsample_avx2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v1_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00010000 ; bias pattern
vmovd xmm7, edx
vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
vpcmpeqw ymm6, ymm6, ymm6
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_YMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r24:
; ecx can possibly be 8, 16, 24
cmp ecx, 24
jne .columnloop_r16
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
.columnloop_r16:
cmp ecx, 16
jne .columnloop_r8
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
vpxor ymm1, ymm1, ymm1
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
.columnloop_r8:
vmovdqu xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
vpxor ymm1, ymm1, ymm1
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
.downsample:
vpsrlw ymm2, ymm0, BYTE_BIT
vpand ymm0, ymm0, ymm6
vpsrlw ymm3, ymm1, BYTE_BIT
vpand ymm1, ymm1, ymm6
vpaddw ymm0, ymm0, ymm2
vpaddw ymm1, ymm1, ymm3
vpaddw ymm0, ymm0, ymm7
vpaddw ymm1, ymm1, ymm7
vpsrlw ymm0, ymm0, 1
vpsrlw ymm1, ymm1, 1
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
sub ecx, byte SIZEOF_YMMWORD ; outcol
add esi, byte 2*SIZEOF_YMMWORD ; inptr
add edi, byte 1*SIZEOF_YMMWORD ; outptr
cmp ecx, byte SIZEOF_YMMWORD
jae short .columnloop
test ecx, ecx
jnz near .columnloop_r24
pop esi
pop edi
pop ecx
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
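; Per output sample the computation is, roughly (a sketch):
;
;   out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + bias) >> 2;
;
; where the bias alternates 1,2,1,2,... across output columns to spread the
; rounding error, after the same expand_right_edge step as in the h2v1 case.
;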
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
EXTN(jsimd_h2v2_downsample_avx2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v2_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00020001 ; bias pattern
vmovd xmm7, edx
vpcmpeqw ymm6, ymm6, ymm6
vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_YMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r24:
cmp ecx, 24
jne .columnloop_r16
vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
.columnloop_r16:
cmp ecx, 16
jne .columnloop_r8
vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
vpxor ymm2, ymm2, ymm2
vpxor ymm3, ymm3, ymm3
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
.columnloop_r8:
vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
vpxor ymm2, ymm2, ymm2
vpxor ymm3, ymm3, ymm3
mov ecx, SIZEOF_YMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
.downsample:
vpand ymm4, ymm0, ymm6
vpsrlw ymm0, ymm0, BYTE_BIT
vpand ymm5, ymm1, ymm6
vpsrlw ymm1, ymm1, BYTE_BIT
vpaddw ymm0, ymm0, ymm4
vpaddw ymm1, ymm1, ymm5
vpand ymm4, ymm2, ymm6
vpsrlw ymm2, ymm2, BYTE_BIT
vpand ymm5, ymm3, ymm6
vpsrlw ymm3, ymm3, BYTE_BIT
vpaddw ymm2, ymm2, ymm4
vpaddw ymm3, ymm3, ymm5
vpaddw ymm0, ymm0, ymm1
vpaddw ymm2, ymm2, ymm3
vpaddw ymm0, ymm0, ymm7
vpaddw ymm2, ymm2, ymm7
vpsrlw ymm0, ymm0, 2
vpsrlw ymm2, ymm2, 2
vpackuswb ymm0, ymm0, ymm2
vpermq ymm0, ymm0, 0xd8
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
sub ecx, byte SIZEOF_YMMWORD ; outcol
add edx, byte 2*SIZEOF_YMMWORD ; inptr0
add esi, byte 2*SIZEOF_YMMWORD ; inptr1
add edi, byte 1*SIZEOF_YMMWORD ; outptr
cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop
test ecx, ecx
jnz near .columnloop_r24
pop esi
pop edi
pop ecx
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,324 @@
;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
EXTN(jsimd_h2v1_downsample_mmx):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v1_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00010000 ; bias pattern
movd mm7, edx
pcmpeqw mm6, mm6
punpckldq mm7, mm7 ; mm7={0, 1, 0, 1}
psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
alignx 16, 7
.columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
movq mm2, mm0
movq mm3, mm1
pand mm0, mm6
psrlw mm2, BYTE_BIT
pand mm1, mm6
psrlw mm3, BYTE_BIT
paddw mm0, mm2
paddw mm1, mm3
paddw mm0, mm7
paddw mm1, mm7
psrlw mm0, 1
psrlw mm1, 1
packuswb mm0, mm1
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
add esi, byte 2*SIZEOF_MMWORD ; inptr
add edi, byte 1*SIZEOF_MMWORD ; outptr
sub ecx, byte SIZEOF_MMWORD ; outcol
jnz short .columnloop
pop esi
pop edi
pop ecx
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg short .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
EXTN(jsimd_h2v2_downsample_mmx):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v2_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00020001 ; bias pattern
movd mm7, edx
pcmpeqw mm6, mm6
punpckldq mm7, mm7 ; mm7={1, 2, 1, 2}
psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr
alignx 16, 7
.columnloop:
movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
movq mm4, mm0
movq mm5, mm1
pand mm0, mm6
psrlw mm4, BYTE_BIT
pand mm1, mm6
psrlw mm5, BYTE_BIT
paddw mm0, mm4
paddw mm1, mm5
movq mm4, mm2
movq mm5, mm3
pand mm2, mm6
psrlw mm4, BYTE_BIT
pand mm3, mm6
psrlw mm5, BYTE_BIT
paddw mm2, mm4
paddw mm3, mm5
paddw mm0, mm1
paddw mm2, mm3
paddw mm0, mm7
paddw mm2, mm7
psrlw mm0, 2
psrlw mm2, 2
packuswb mm0, mm2
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
add edx, byte 2*SIZEOF_MMWORD ; inptr0
add esi, byte 2*SIZEOF_MMWORD ; inptr1
add edi, byte 1*SIZEOF_MMWORD ; outptr
sub ecx, byte SIZEOF_MMWORD ; outcol
jnz near .columnloop
pop esi
pop edi
pop ecx
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,351 @@
;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
EXTN(jsimd_h2v1_downsample_sse2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v1_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00010000 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r8:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm1, xmm1
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0, xmm6
psrlw xmm2, BYTE_BIT
pand xmm1, xmm6
psrlw xmm3, BYTE_BIT
paddw xmm0, xmm2
paddw xmm1, xmm3
paddw xmm0, xmm7
paddw xmm1, xmm7
psrlw xmm0, 1
psrlw xmm1, 1
packuswb xmm0, xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
sub ecx, byte SIZEOF_XMMWORD ; outcol
add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
test ecx, ecx
jnz short .columnloop_r8
pop esi
pop edi
pop ecx
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
; JDIMENSION v_samp_factor,
; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
; JSAMPARRAY output_data);
;
%define img_width(b) (b) + 8 ; JDIMENSION image_width
%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
align 32
GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
EXTN(jsimd_h2v2_downsample_sse2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov ecx, JDIMENSION [width_blks(ebp)]
shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
jz near .return
mov edx, JDIMENSION [img_width(ebp)]
; -- expand_right_edge
push ecx
shl ecx, 1 ; output_cols * 2
sub ecx, edx
jle short .expand_end
mov eax, INT [max_v_samp(ebp)]
test eax, eax
jle short .expand_end
cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7
.expandloop:
push eax
push ecx
mov edi, JSAMPROW [esi]
add edi, edx
mov al, JSAMPLE [edi-1]
rep stosb
pop ecx
pop eax
add esi, byte SIZEOF_JSAMPROW
dec eax
jg short .expandloop
.expand_end:
pop ecx ; output_cols
; -- h2v2_downsample
mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
test eax, eax
jle near .return
mov edx, 0x00020001 ; bias pattern
movd xmm7, edx
pcmpeqw xmm6, xmm6
pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7
.rowloop:
push ecx
push edi
push esi
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop
alignx 16, 7
.columnloop_r8:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm2, xmm2
pxor xmm3, xmm3
mov ecx, SIZEOF_XMMWORD
jmp short .downsample
alignx 16, 7
.columnloop:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
.downsample:
movdqa xmm4, xmm0
movdqa xmm5, xmm1
pand xmm0, xmm6
psrlw xmm4, BYTE_BIT
pand xmm1, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm0, xmm4
paddw xmm1, xmm5
movdqa xmm4, xmm2
movdqa xmm5, xmm3
pand xmm2, xmm6
psrlw xmm4, BYTE_BIT
pand xmm3, xmm6
psrlw xmm5, BYTE_BIT
paddw xmm2, xmm4
paddw xmm3, xmm5
paddw xmm0, xmm1
paddw xmm2, xmm3
paddw xmm0, xmm7
paddw xmm2, xmm7
psrlw xmm0, 2
psrlw xmm2, 2
packuswb xmm0, xmm2
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
sub ecx, byte SIZEOF_XMMWORD ; outcol
add edx, byte 2*SIZEOF_XMMWORD ; inptr0
add esi, byte 2*SIZEOF_XMMWORD ; inptr1
add edi, byte 1*SIZEOF_XMMWORD ; outptr
cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop
test ecx, ecx
jnz near .columnloop_r8
pop esi
pop edi
pop ecx
add esi, byte 2*SIZEOF_JSAMPROW ; input_data
add edi, byte 1*SIZEOF_JSAMPROW ; output_data
dec eax ; rowctr
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,515 @@
;
; jdcolext.asm - colorspace conversion (AVX2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
; JDIMENSION input_row, JSAMPARRAY output_buf,
; int num_rows)
;
%define out_width(b) (b) + 8 ; JDIMENSION out_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define input_row(b) (b) + 16 ; JDIMENSION input_row
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
; ymmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
EXTN(jsimd_ycc_rgb_convert_avx2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
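; The prologue above builds a 32-byte-aligned frame: esp is rounded down to
; a SIZEOF_YMMWORD boundary, the pre-alignment stack pointer is stored at
; [ebp] (the "original_ebp" slot), and esp is then dropped below the wk()
; scratch area.  This lets the wk() slots be accessed with aligned vmovdqa
; loads/stores; the epilogue's "mov esp, ebp / pop esp" undoes the alignment.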
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
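; The PW_*/PD_* constant tables are addressed GOT-relative through
; GOTOFF(eax, ...), so the GOT pointer fetched here is spilled to the
; gotptr stack slot and reloaded into eax at the start of every row
; (eax is reused as a scratch register inside the loop).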
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
push eax
push edi
push edx
push ebx
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7
.columnloop:
vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vpcmpeqw ymm0, ymm0, ymm0
vpcmpeqw ymm7, ymm7, ymm7
vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
vpaddw ymm2, ymm4, ymm7
vpaddw ymm3, ymm5, ymm7
vpaddw ymm6, ymm0, ymm7
vpaddw ymm7, ymm1, ymm7
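; ymm2/ymm3 and ymm6/ymm7 now hold the centered chroma values Cb-128 and
; Cr-128: adding 0xFF80 to a zero-extended byte is the same as subtracting
; 128 in 16-bit arithmetic, which removes the +128 offset of the stored
; Cb/Cr samples before the equations below are applied.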
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
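; The reformulation exists because vpmulhw keeps only the high 16 bits of a
; signed 16x16-bit product, so a multiplier is usable only if its FIX()
; value (scaled by 2^16) fits in a signed 16-bit word.  Constants of 1.0 or
; more are therefore split into a small fractional part plus whole copies
; of Cb/Cr that are added separately:
;   1.77200 = 2 - 0.22800,  1.40200 = 1 + 0.40200,  -0.71414 = 0.28586 - 1
; (see PW_MF0228, PW_F0402 and PW_MF0344_F0285 in jdcolor.asm).  The
; fractional multiplies are applied to 2*Cb / 2*Cr and then rounded with
; (product + 1) >> 1, which effectively rounds to nearest rather than
; truncating.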
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800))
vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800))
vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200))
vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200))
vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
vpaddw ymm4, ymm4, ymm2
vpaddw ymm5, ymm5, ymm3
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
vpunpckhwd ymm4, ymm2, ymm6
vpunpcklwd ymm2, ymm2, ymm6
vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
vpunpckhwd ymm5, ymm3, ymm7
vpunpcklwd ymm3, ymm3, ymm7
vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
vpsrad ymm2, ymm2, SCALEBITS
vpsrad ymm4, ymm4, SCALEBITS
vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
vpsrad ymm3, ymm3, SCALEBITS
vpsrad ymm5, ymm5, SCALEBITS
vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vpcmpeqw ymm4, ymm4, ymm4
vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
%if RGB_PIXELSIZE == 3 ; ---------------
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
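; The unpack/shuffle/vperm2i128 sequence below interleaves these six
; even/odd component planes into ymmA, ymmD and ymmF, which end up holding
; the 96 output bytes (32 pixels x 3 components) in packed per-pixel
; component order, ready to be stored.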
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st64
test edi, SIZEOF_YMMWORD-1
jnz short .out1
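; When the output pointer is 32-byte aligned, non-temporal (vmovntdq)
; stores are used so the pixel data bypasses the cache on its way to
; memory; the sfence issued after the row loop orders those streaming
; stores before the function returns.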
; --(aligned)-------------------
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
jmp short .out0
.out1: ; --(unaligned)-----------------
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
sub ecx, byte SIZEOF_YMMWORD
jz near .nextrow
add esi, byte SIZEOF_YMMWORD ; inptr0
add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
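; Fewer than SIZEOF_YMMWORD pixels remain, so the packed RGB bytes held in
; ymmA/ymmD/ymmF are flushed in progressively smaller pieces (64, 32, 16,
; 8, 4, 2 and finally 1 byte), shifting the remaining data down after each
; partial store.  ecx is first converted from a pixel count to a byte count
; (x3 for 3-byte pixels).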
.column_st64:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_YMMWORD
jb short .column_st32
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
add edi, byte 2*SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmF
sub ecx, byte 2*SIZEOF_YMMWORD
jmp short .column_st31
.column_st32:
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st31
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
add edi, byte SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmD
sub ecx, byte SIZEOF_YMMWORD
jmp short .column_st31
.column_st31:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
vperm2i128 ymmA, ymmA, ymmA, 1
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
vmovq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
vmovd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
vpsrldq xmmA, xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
vmovd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%else
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%endif
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
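; With a 4-byte pixel the job is a plain 4-plane byte transpose: the eight
; even/odd planes are interleaved into ymmA, ymmD, ymmC and ymmH, holding
; the 128 output bytes (32 pixels x 4 components); the fourth component is
; the filler channel prepared above (all 0xFF or all zero depending on
; RGBX_FILLER_0XFF).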
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st64
test edi, SIZEOF_YMMWORD-1
jnz short .out1
; --(aligned)-------------------
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
jmp short .out0
.out1: ; --(unaligned)-----------------
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
.out0:
add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
sub ecx, byte SIZEOF_YMMWORD
jz near .nextrow
add esi, byte SIZEOF_YMMWORD ; inptr0
add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st64:
cmp ecx, byte SIZEOF_YMMWORD/2
jb short .column_st32
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
add edi, byte 2*SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmC
vmovdqa ymmD, ymmH
sub ecx, byte SIZEOF_YMMWORD/2
.column_st32:
cmp ecx, byte SIZEOF_YMMWORD/4
jb short .column_st16
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
add edi, byte SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmD
sub ecx, byte SIZEOF_YMMWORD/4
.column_st16:
cmp ecx, byte SIZEOF_YMMWORD/8
jb short .column_st15
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
vperm2i128 ymmA, ymmA, ymmA, 1
add edi, byte SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_YMMWORD/8
.column_st15:
; Store two pixels (8 bytes) of ymmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_YMMWORD/16
jb short .column_st7
vmovq MMWORD [edi], xmmA
add edi, byte SIZEOF_YMMWORD/16*4
sub ecx, byte SIZEOF_YMMWORD/16
vpsrldq xmmA, xmmA, SIZEOF_YMMWORD/16*4
.column_st7:
; Store one pixel (4 bytes) of ymmA to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
vmovd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7
.nextrow:
pop ecx
pop esi
pop ebx
pop edx
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf
dec eax ; num_rows
jg near .rowloop
sfence ; flush the write buffer
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,404 @@
;
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
; JDIMENSION input_row, JSAMPARRAY output_buf,
; int num_rows)
;
%define out_width(b) (b) + 8 ; JDIMENSION out_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define input_row(b) (b) + 16 ; JDIMENSION input_row
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
; mmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
EXTN(jsimd_ycc_rgb_convert_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
push eax
push edi
push edx
push ebx
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7
.columnloop:
movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
pcmpeqw mm4, mm4
pcmpeqw mm7, mm7
psrlw mm4, BYTE_BIT
psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
pand mm4, mm5 ; mm4=Cb(0246)=CbE
psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO
pand mm0, mm1 ; mm0=Cr(0246)=CrE
psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO
paddw mm4, mm7
paddw mm5, mm7
paddw mm0, mm7
paddw mm1, mm7
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
movq mm2, mm4 ; mm2=CbE
movq mm3, mm5 ; mm3=CbO
paddw mm4, mm4 ; mm4=2*CbE
paddw mm5, mm5 ; mm5=2*CbO
movq mm6, mm0 ; mm6=CrE
movq mm7, mm1 ; mm7=CrO
paddw mm0, mm0 ; mm0=2*CrE
paddw mm1, mm1 ; mm1=2*CrO
pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
paddw mm4, [GOTOFF(eax,PW_ONE)]
paddw mm5, [GOTOFF(eax,PW_ONE)]
psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800))
psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800))
paddw mm0, [GOTOFF(eax,PW_ONE)]
paddw mm1, [GOTOFF(eax,PW_ONE)]
psraw mm0, 1 ; mm0=(CrE * FIX(0.40200))
psraw mm1, 1 ; mm1=(CrO * FIX(0.40200))
paddw mm4, mm2
paddw mm5, mm3
paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
movq mm4, mm2
movq mm5, mm3
punpcklwd mm2, mm6
punpckhwd mm4, mm6
pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd mm3, mm7
punpckhwd mm5, mm7
pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
psrad mm2, SCALEBITS
psrad mm4, SCALEBITS
paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
psrad mm3, SCALEBITS
psrad mm5, SCALEBITS
packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movq mm5, MMWORD [esi] ; mm5=Y(01234567)
pcmpeqw mm4, mm4
psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
pand mm4, mm5 ; mm4=Y(0246)=YE
psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO
paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
%if RGB_PIXELSIZE == 3 ; ---------------
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
movq mmG, mmA
movq mmH, mmA
punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
movq mmC, mmD
movq mmB, mmD
punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
movq mmF, mmE
punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st16
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
sub ecx, byte SIZEOF_MMWORD
jz short .nextrow
add esi, byte SIZEOF_MMWORD ; inptr0
add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
jmp near .columnloop
alignx 16, 7
.column_st16:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_MMWORD
jb short .column_st8
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
movq mmA, mmC
sub ecx, byte 2*SIZEOF_MMWORD
add edi, byte 2*SIZEOF_MMWORD
jmp short .column_st4
.column_st8:
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st4
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq mmA, mmE
sub ecx, byte SIZEOF_MMWORD
add edi, byte SIZEOF_MMWORD
.column_st4:
movd eax, mmA
cmp ecx, byte SIZEOF_DWORD
jb short .column_st2
mov dword [edi+0*SIZEOF_DWORD], eax
psrlq mmA, DWORD_BIT
movd eax, mmA
sub ecx, byte SIZEOF_DWORD
add edi, byte SIZEOF_DWORD
.column_st2:
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi+0*SIZEOF_WORD], ax
shr eax, WORD_BIT
sub ecx, byte SIZEOF_WORD
add edi, byte SIZEOF_WORD
.column_st1:
cmp ecx, byte SIZEOF_BYTE
jb short .nextrow
mov byte [edi+0*SIZEOF_BYTE], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
movq mmC, mmA
punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
movq mmG, mmB
punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
movq mmD, mmA
punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
movq mmH, mmC
punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st16
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
sub ecx, byte SIZEOF_MMWORD
jz short .nextrow
add esi, byte SIZEOF_MMWORD ; inptr0
add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
jmp near .columnloop
alignx 16, 7
.column_st16:
cmp ecx, byte SIZEOF_MMWORD/2
jb short .column_st8
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
movq mmA, mmC
movq mmD, mmH
sub ecx, byte SIZEOF_MMWORD/2
add edi, byte 2*SIZEOF_MMWORD
.column_st8:
cmp ecx, byte SIZEOF_MMWORD/4
jb short .column_st4
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq mmA, mmD
sub ecx, byte SIZEOF_MMWORD/4
add edi, byte 1*SIZEOF_MMWORD
.column_st4:
cmp ecx, byte SIZEOF_MMWORD/8
jb short .nextrow
movd dword [edi+0*SIZEOF_DWORD], mmA
%endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7
.nextrow:
pop ecx
pop esi
pop ebx
pop edx
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf
dec eax ; num_rows
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,458 @@
;
; jdcolext.asm - colorspace conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
; JDIMENSION input_row, JSAMPARRAY output_buf,
; int num_rows)
;
%define out_width(b) (b) + 8 ; JDIMENSION out_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define input_row(b) (b) + 16 ; JDIMENSION input_row
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define num_rows(b) (b) + 24 ; int num_rows
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
EXTN(jsimd_ycc_rgb_convert_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [input_row(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
lea edx, [edx+ecx*SIZEOF_JSAMPROW]
pop ecx
mov edi, JSAMPARRAY [output_buf(eax)]
mov eax, INT [num_rows(eax)]
test eax, eax
jle near .return
alignx 16, 7
.rowloop:
push eax
push edi
push edx
push ebx
push esi
push ecx ; col
mov esi, JSAMPROW [esi] ; inptr0
mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7
.columnloop:
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
pcmpeqw xmm4, xmm4
pcmpeqw xmm7, xmm7
psrlw xmm4, BYTE_BIT
psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
paddw xmm4, xmm7
paddw xmm5, xmm7
paddw xmm0, xmm7
paddw xmm1, xmm7
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm2, xmm4 ; xmm2=CbE
movdqa xmm3, xmm5 ; xmm3=CbO
paddw xmm4, xmm4 ; xmm4=2*CbE
paddw xmm5, xmm5 ; xmm5=2*CbO
movdqa xmm6, xmm0 ; xmm6=CrE
movdqa xmm7, xmm1 ; xmm7=CrO
paddw xmm0, xmm0 ; xmm0=2*CrE
paddw xmm1, xmm1 ; xmm1=2*CrO
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
paddw xmm4, [GOTOFF(eax,PW_ONE)]
paddw xmm5, [GOTOFF(eax,PW_ONE)]
psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
paddw xmm0, [GOTOFF(eax,PW_ONE)]
paddw xmm1, [GOTOFF(eax,PW_ONE)]
psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
paddw xmm4, xmm2
paddw xmm5, xmm3
paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
movdqa xmm4, xmm2
movdqa xmm5, xmm3
punpcklwd xmm2, xmm6
punpckhwd xmm4, xmm6
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm3, xmm7
punpckhwd xmm5, xmm7
pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2, SCALEBITS
psrad xmm4, SCALEBITS
paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm3, SCALEBITS
psrad xmm5, SCALEBITS
packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
pcmpeqw xmm4, xmm4
psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
add esi, byte SIZEOF_XMMWORD ; inptr0
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .nextrow
movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7
.nextrow:
pop ecx
pop esi
pop ebx
pop edx
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW
add ebx, byte SIZEOF_JSAMPROW
add edx, byte SIZEOF_JSAMPROW
add edi, byte SIZEOF_JSAMPROW ; output_buf
dec eax ; num_rows
jg near .rowloop
sfence ; flush the write buffer
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,118 @@
;
; jdcolor.asm - colorspace conversion (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
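; FIX(x) here means x scaled by 2^16 and rounded, e.g. FIX(1.40200) = 91881.
; The derived values are the fractional remainders used by the word
; multiplies: F_0_402 = 91881 - 65536 = 26345 = FIX(0.40200),
; F_0_285 = 65536 - 46802 = 18734 = FIX(0.28586) and
; F_0_228 = 131072 - 116130 = 14942 = FIX(0.22800), all of which fit in a
; signed 16-bit word as vpmulhw/vpmaddwd require.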
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
EXTN(jconst_ycc_rgb_convert_avx2):
PW_F0402 times 16 dw F_0_402
PW_MF0228 times 16 dw -F_0_228
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
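; The conversion core in jdcolext-avx2.asm is assembled seven times: once
; below with the default RGB layout, then once per extended pixel format,
; re-%including the file after redefining RGB_RED/RGB_GREEN/RGB_BLUE,
; RGB_PIXELSIZE and the exported function name.  Each pass therefore emits
; an independent, format-specific jsimd_ycc_ext*_convert_avx2 routine.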
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
%include "jdcolext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
%include "jdcolext-avx2.asm"

@@ -0,0 +1,117 @@
;
; jdcolor.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
EXTN(jconst_ycc_rgb_convert_mmx):
PW_F0402 times 4 dw F_0_402
PW_MF0228 times 4 dw -F_0_228
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
%include "jdcolext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
%include "jdcolext-mmx.asm"

@@ -0,0 +1,117 @@
;
; jdcolor.asm - colorspace conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2):
PW_F0402 times 8 dw F_0_402
PW_MF0228 times 8 dw -F_0_228
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
%include "jdcolext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
%include "jdcolext-sse2.asm"

@@ -0,0 +1,136 @@
;
; jdmerge.asm - merged upsampling/color conversion (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_merged_upsample_avx2)
EXTN(jconst_merged_upsample_avx2):
PW_F0402 times 16 dw F_0_402
PW_MF0228 times 16 dw -F_0_228
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
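; Same re-inclusion scheme as jdcolor.asm: jdmrgext-avx2.asm is assembled
; once per pixel format, each pass defining a matched pair of h2v1 and
; h2v2 merged upsample/color conversion entry points for that format.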
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extrgb_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extrgb_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extrgbx_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extrgbx_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extbgr_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extbgr_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extbgrx_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extbgrx_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extxbgr_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extxbgr_merged_upsample_avx2
%include "jdmrgext-avx2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_avx2 \
jsimd_h2v1_extxrgb_merged_upsample_avx2
%define jsimd_h2v2_merged_upsample_avx2 \
jsimd_h2v2_extxrgb_merged_upsample_avx2
%include "jdmrgext-avx2.asm"

@@ -0,0 +1,123 @@
;
; jdmerge.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_merged_upsample_mmx)
EXTN(jconst_merged_upsample_mmx):
PW_F0402 times 4 dw F_0_402
PW_MF0228 times 4 dw -F_0_228
PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
%include "jdmrgext-mmx.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
%include "jdmrgext-mmx.asm"

@@ -0,0 +1,135 @@
;
; jdmerge.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
%define SCALEBITS 16
F_0_344 equ 22554 ; FIX(0.34414)
F_0_714 equ 46802 ; FIX(0.71414)
F_1_402 equ 91881 ; FIX(1.40200)
F_1_772 equ 116130 ; FIX(1.77200)
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2):
PW_F0402 times 8 dw F_0_402
PW_MF0228 times 8 dw -F_0_228
PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGB_RED
%define RGB_GREEN EXT_RGB_GREEN
%define RGB_BLUE EXT_RGB_BLUE
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extrgb_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_RGBX_RED
%define RGB_GREEN EXT_RGBX_GREEN
%define RGB_BLUE EXT_RGBX_BLUE
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extrgbx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extrgbx_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGR_RED
%define RGB_GREEN EXT_BGR_GREEN
%define RGB_BLUE EXT_BGR_BLUE
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extbgr_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_BGRX_RED
%define RGB_GREEN EXT_BGRX_GREEN
%define RGB_BLUE EXT_BGRX_BLUE
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extbgrx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extbgrx_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XBGR_RED
%define RGB_GREEN EXT_XBGR_GREEN
%define RGB_BLUE EXT_XBGR_BLUE
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extxbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extxbgr_merged_upsample_sse2
%include "jdmrgext-sse2.asm"
%undef RGB_RED
%undef RGB_GREEN
%undef RGB_BLUE
%undef RGB_PIXELSIZE
%define RGB_RED EXT_XRGB_RED
%define RGB_GREEN EXT_XRGB_GREEN
%define RGB_BLUE EXT_XRGB_BLUE
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define jsimd_h2v1_merged_upsample_sse2 \
jsimd_h2v1_extxrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 \
jsimd_h2v2_extxrgb_merged_upsample_sse2
%include "jdmrgext-sse2.asm"

@@ -0,0 +1,575 @@
;
; jdmrgext.asm - merged upsampling/color conversion (AVX2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
; JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
; ymmword wk[WK_NUM]
%define WK_NUM 3
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
EXTN(jsimd_h2v1_merged_upsample_avx2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
pop ecx ; col
alignx 16, 7
.columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax)
vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
vpcmpeqw ymm3, ymm3, ymm3
vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
vpaddw ymm5, ymm6, ymm3
vpaddw ymm2, ymm4, ymm3
vpaddw ymm1, ymm7, ymm3
vpaddw ymm3, ymm0, ymm3
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
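; (Rationale: every fractional coefficient in the rewrite fits in a signed
; 16-bit word, as the vpmulhw multiplies require; the parts >= 1 are folded
; into whole Cb/Cr additions or subtractions instead.  Each product is taken
; on the doubled operand and then halved with rounding -- vpaddw PW_ONE,
; vpsraw 1 -- which keeps one more bit of precision than a single vpmulhw
; on the undoubled value.)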
vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800))
vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800))
vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200))
vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200))
vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
vpaddw ymm6, ymm6, ymm5
vpaddw ymm4, ymm4, ymm2
vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
vpunpckhwd ymm6, ymm5, ymm1
vpunpcklwd ymm5, ymm5, ymm1
vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
vpunpckhwd ymm7, ymm2, ymm3
vpunpcklwd ymm2, ymm2, ymm3
vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
vpsrad ymm5, ymm5, SCALEBITS
vpsrad ymm6, ymm6, SCALEBITS
vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
vpsrad ymm2, ymm2, SCALEBITS
vpsrad ymm7, ymm7, SCALEBITS
vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
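; The chroma planes are subsampled 2:1 horizontally, so one vector of Cb/Cr
; differences covers two vectors of luma samples.  Yctr counts the two Y
; passes: the first uses the low-half (L) differences still in registers,
; the second reloads the high-half (H) differences from wk(0..2), and only
; after both passes do the Cb/Cr input pointers advance.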
mov al, 2 ; Yctr
jmp short .Yloop_1st
alignx 16, 7
.Yloop_2nd:
vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
alignx 16, 7
.Yloop_1st:
vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vpcmpeqw ymm6, ymm6, ymm6
vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
%if RGB_PIXELSIZE == 3 ; ---------------
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st64
test edi, SIZEOF_YMMWORD-1
jnz short .out1
; --(aligned)-------------------
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
jmp short .out0
.out1: ; --(unaligned)-----------------
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
sub ecx, byte SIZEOF_YMMWORD
jz near .endcolumn
add esi, byte SIZEOF_YMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st64:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_YMMWORD
jb short .column_st32
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
add edi, byte 2*SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmF
sub ecx, byte 2*SIZEOF_YMMWORD
jmp short .column_st31
.column_st32:
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st31
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
add edi, byte SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmD
sub ecx, byte SIZEOF_YMMWORD
jmp short .column_st31
.column_st31:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
vperm2i128 ymmA, ymmA, ymmA, 1
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
vmovq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
vmovd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
vpsrldq xmmA, xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
vmovd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%else
vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%endif
; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
cmp ecx, byte SIZEOF_YMMWORD
jb short .column_st64
test edi, SIZEOF_YMMWORD-1
jnz short .out1
; --(aligned)-------------------
vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
jmp short .out0
.out1: ; --(unaligned)-----------------
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
.out0:
add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
sub ecx, byte SIZEOF_YMMWORD
jz near .endcolumn
add esi, byte SIZEOF_YMMWORD ; inptr0
dec al
jnz near .Yloop_2nd
add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st64:
cmp ecx, byte SIZEOF_YMMWORD/2
jb short .column_st32
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
add edi, byte 2*SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmC
vmovdqa ymmD, ymmH
sub ecx, byte SIZEOF_YMMWORD/2
.column_st32:
cmp ecx, byte SIZEOF_YMMWORD/4
jb short .column_st16
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
add edi, byte SIZEOF_YMMWORD ; outptr
vmovdqa ymmA, ymmD
sub ecx, byte SIZEOF_YMMWORD/4
.column_st16:
cmp ecx, byte SIZEOF_YMMWORD/8
jb short .column_st15
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
vperm2i128 ymmA, ymmA, ymmA, 1
sub ecx, byte SIZEOF_YMMWORD/8
.column_st15:
; Store two pixels (8 bytes) of ymmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_YMMWORD/16
jb short .column_st7
vmovq MMWORD [edi], xmmA
add edi, byte SIZEOF_YMMWORD/16*4
sub ecx, byte SIZEOF_YMMWORD/16
vpsrldq xmmA, SIZEOF_YMMWORD/16*4
.column_st7:
; Store one pixel (4 bytes) of ymmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
vmovd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
.endcolumn:
sfence ; flush the write buffer
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
; JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
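; The 2:1 vertical case calls the h2v1 routine twice, once per output row:
; a temporary three-entry input_buf {inptr00, inptr1, inptr2} is built on
; the stack (ebx points at it), and between the two calls the luma entry and
; the output_buf argument in that block are advanced by one row
; (inptr01 / outptr1).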
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
align 32
GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
EXTN(jsimd_h2v2_merged_upsample_avx2):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov eax, POINTER [output_width(ebp)]
mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
push edx ; inptr2
push ebx ; inptr1
push esi ; inptr00
mov ebx, esp
push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr
push ebx ; input_buf
push eax ; output_width
call near EXTN(jsimd_h2v1_merged_upsample_avx2)
add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi
call near EXTN(jsimd_h2v1_merged_upsample_avx2)
add esp, byte 7*SIZEOF_DWORD
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,460 @@
;
; jdmrgext.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 3
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
EXTN(jsimd_h2v1_merged_upsample_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
pop ecx ; col
alignx 16, 7
.columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax)
movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
pxor mm1, mm1 ; mm1=(all 0's)
pcmpeqw mm3, mm3
psllw mm3, 7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
movq mm4, mm6
punpckhbw mm6, mm1 ; mm6=Cb(4567)=CbH
punpcklbw mm4, mm1 ; mm4=Cb(0123)=CbL
movq mm0, mm7
punpckhbw mm7, mm1 ; mm7=Cr(4567)=CrH
punpcklbw mm0, mm1 ; mm0=Cr(0123)=CrL
paddw mm6, mm3
paddw mm4, mm3
paddw mm7, mm3
paddw mm0, mm3
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
movq mm5, mm6 ; mm5=CbH
movq mm2, mm4 ; mm2=CbL
paddw mm6, mm6 ; mm6=2*CbH
paddw mm4, mm4 ; mm4=2*CbL
movq mm1, mm7 ; mm1=CrH
movq mm3, mm0 ; mm3=CrL
paddw mm7, mm7 ; mm7=2*CrH
paddw mm0, mm0 ; mm0=2*CrL
pmulhw mm6, [GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
pmulhw mm7, [GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
paddw mm6, [GOTOFF(eax,PW_ONE)]
paddw mm4, [GOTOFF(eax,PW_ONE)]
psraw mm6, 1 ; mm6=(CbH * -FIX(0.22800))
psraw mm4, 1 ; mm4=(CbL * -FIX(0.22800))
paddw mm7, [GOTOFF(eax,PW_ONE)]
paddw mm0, [GOTOFF(eax,PW_ONE)]
psraw mm7, 1 ; mm7=(CrH * FIX(0.40200))
psraw mm0, 1 ; mm0=(CrL * FIX(0.40200))
paddw mm6, mm5
paddw mm4, mm2
paddw mm6, mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
paddw mm4, mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
paddw mm7, mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
paddw mm0, mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
movq mm6, mm5
movq mm7, mm2
punpcklwd mm5, mm1
punpckhwd mm6, mm1
pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd mm6, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd mm2, mm3
punpckhwd mm7, mm3
pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd mm7, [GOTOFF(eax,PW_MF0344_F0285)]
paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
paddd mm6, [GOTOFF(eax,PD_ONEHALF)]
psrad mm5, SCALEBITS
psrad mm6, SCALEBITS
paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
paddd mm7, [GOTOFF(eax,PD_ONEHALF)]
psrad mm2, SCALEBITS
psrad mm7, SCALEBITS
packssdw mm5, mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw mm2, mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw mm5, mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw mm2, mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
mov al, 2 ; Yctr
jmp short .Yloop_1st
alignx 16, 7
.Yloop_2nd:
movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
alignx 16, 7
.Yloop_1st:
movq mm7, MMWORD [esi] ; mm7=Y(01234567)
pcmpeqw mm6, mm6
psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
pand mm6, mm7 ; mm6=Y(0246)=YE
psrlw mm7, BYTE_BIT ; mm7=Y(1357)=YO
movq mm1, mm0 ; mm1=mm0=(R-Y)(L/H)
movq mm3, mm2 ; mm3=mm2=(G-Y)(L/H)
movq mm5, mm4 ; mm5=mm4=(B-Y)(L/H)
paddw mm0, mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
paddw mm1, mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
paddw mm2, mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
paddw mm3, mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
paddw mm4, mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
paddw mm5, mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
%if RGB_PIXELSIZE == 3 ; ---------------
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
movq mmG, mmA
movq mmH, mmA
punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
movq mmC, mmD
movq mmB, mmD
punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
movq mmF, mmE
punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st16
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
sub ecx, byte SIZEOF_MMWORD
jz near .endcolumn
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
add esi, byte SIZEOF_MMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st16:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_MMWORD
jb short .column_st8
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
movq mmA, mmC
sub ecx, byte 2*SIZEOF_MMWORD
add edi, byte 2*SIZEOF_MMWORD
jmp short .column_st4
.column_st8:
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st4
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq mmA, mmE
sub ecx, byte SIZEOF_MMWORD
add edi, byte SIZEOF_MMWORD
.column_st4:
movd eax, mmA
cmp ecx, byte SIZEOF_DWORD
jb short .column_st2
mov dword [edi+0*SIZEOF_DWORD], eax
psrlq mmA, DWORD_BIT
movd eax, mmA
sub ecx, byte SIZEOF_DWORD
add edi, byte SIZEOF_DWORD
.column_st2:
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi+0*SIZEOF_WORD], ax
shr eax, WORD_BIT
sub ecx, byte SIZEOF_WORD
add edi, byte SIZEOF_WORD
.column_st1:
cmp ecx, byte SIZEOF_BYTE
jb short .endcolumn
mov byte [edi+0*SIZEOF_BYTE], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
movq mmC, mmA
punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
movq mmG, mmB
punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
movq mmD, mmA
punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
movq mmH, mmC
punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st16
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
sub ecx, byte SIZEOF_MMWORD
jz short .endcolumn
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
add esi, byte SIZEOF_MMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st16:
cmp ecx, byte SIZEOF_MMWORD/2
jb short .column_st8
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
movq mmA, mmC
movq mmD, mmH
sub ecx, byte SIZEOF_MMWORD/2
add edi, byte 2*SIZEOF_MMWORD
.column_st8:
cmp ecx, byte SIZEOF_MMWORD/4
jb short .column_st4
movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
movq mmA, mmD
sub ecx, byte SIZEOF_MMWORD/4
add edi, byte 1*SIZEOF_MMWORD
.column_st4:
cmp ecx, byte SIZEOF_MMWORD/8
jb short .endcolumn
movd dword [edi+0*SIZEOF_DWORD], mmA
%endif ; RGB_PIXELSIZE ; ---------------
.endcolumn:
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
align 32
GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
EXTN(jsimd_h2v2_merged_upsample_mmx):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov eax, JDIMENSION [output_width(ebp)]
mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
push edx ; inptr2
push ebx ; inptr1
push esi ; inptr00
mov ebx, esp
push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr
push ebx ; input_buf
push eax ; output_width
call near EXTN(jsimd_h2v1_merged_upsample_mmx)
add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi
call near EXTN(jsimd_h2v1_merged_upsample_mmx)
add esp, byte 7*SIZEOF_DWORD
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,517 @@
;
; jdmrgext.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler
; (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jcolsamp.inc"
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
; JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 3
%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
EXTN(jsimd_h2v1_merged_upsample_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx
jz near .return
push ecx
mov edi, JSAMPIMAGE [input_buf(eax)]
mov ecx, JDIMENSION [in_row_group_ctr(eax)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(eax)]
mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
mov edi, JSAMPROW [edi] ; outptr
pop ecx ; col
alignx 16, 7
.columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax)
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
pxor xmm1, xmm1 ; xmm1=(all 0's)
pcmpeqw xmm3, xmm3
psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
movdqa xmm4, xmm6
punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
movdqa xmm0, xmm7
punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
paddw xmm6, xmm3
paddw xmm4, xmm3
paddw xmm7, xmm3
paddw xmm0, xmm3
; (Original)
; R = Y + 1.40200 * Cr
; G = Y - 0.34414 * Cb - 0.71414 * Cr
; B = Y + 1.77200 * Cb
;
; (This implementation)
; R = Y + 0.40200 * Cr + Cr
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
; B = Y - 0.22800 * Cb + Cb + Cb
movdqa xmm5, xmm6 ; xmm5=CbH
movdqa xmm2, xmm4 ; xmm2=CbL
paddw xmm6, xmm6 ; xmm6=2*CbH
paddw xmm4, xmm4 ; xmm4=2*CbL
movdqa xmm1, xmm7 ; xmm1=CrH
movdqa xmm3, xmm0 ; xmm3=CrL
paddw xmm7, xmm7 ; xmm7=2*CrH
paddw xmm0, xmm0 ; xmm0=2*CrL
pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
paddw xmm6, [GOTOFF(eax,PW_ONE)]
paddw xmm4, [GOTOFF(eax,PW_ONE)]
psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
paddw xmm7, [GOTOFF(eax,PW_ONE)]
paddw xmm0, [GOTOFF(eax,PW_ONE)]
psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
paddw xmm6, xmm5
paddw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
movdqa xmm6, xmm5
movdqa xmm7, xmm2
punpcklwd xmm5, xmm1
punpckhwd xmm6, xmm1
pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
punpcklwd xmm2, xmm3
punpckhwd xmm7, xmm3
pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm5, SCALEBITS
psrad xmm6, SCALEBITS
paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
psrad xmm2, SCALEBITS
psrad xmm7, SCALEBITS
packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
mov al, 2 ; Yctr
jmp short .Yloop_1st
alignx 16, 7
.Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16, 7
.Yloop_1st:
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
pcmpeqw xmm6, xmm6
psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
%if RGB_PIXELSIZE == 3 ; ---------------
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
movdqa xmmG, xmmA
movdqa xmmH, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
movdqa xmmC, xmmD
movdqa xmmB, xmmD
punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
movdqa xmmF, xmmE
punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
movdqa xmmB, xmmE
punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
movdqa xmmB, xmmF
punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
.column_st7:
; Store the lower 4 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
.column_st3:
; Store the lower 2 bytes of eax to the output when it has enough
; space.
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
.column_st1:
; Store the lower 1 byte of eax to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
%ifdef RGBX_FILLER_0XFF
pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%else
pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
%endif
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
movdqa xmmC, xmmA
punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
movdqa xmmG, xmmB
punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
movdqa xmmD, xmmA
punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
movdqa xmmH, xmmC
punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st32
test edi, SIZEOF_XMMWORD-1
jnz short .out1
; --(aligned)-------------------
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
jmp short .out0
.out1: ; --(unaligned)-----------------
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
add esi, byte SIZEOF_XMMWORD ; inptr0
dec al ; Yctr
jnz near .Yloop_2nd
add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop
alignx 16, 7
.column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmC
movdqa xmmD, xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA, xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
.endcolumn:
sfence ; flush the write buffer
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
; JSAMPIMAGE input_buf,
; JDIMENSION in_row_group_ctr,
; JSAMPARRAY output_buf);
;
%define output_width(b) (b) + 8 ; JDIMENSION output_width
%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
align 32
GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
EXTN(jsimd_h2v2_merged_upsample_sse2):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov eax, POINTER [output_width(ebp)]
mov edi, JSAMPIMAGE [input_buf(ebp)]
mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
mov edi, JSAMPARRAY [output_buf(ebp)]
lea esi, [esi+ecx*SIZEOF_JSAMPROW]
push edx ; inptr2
push ebx ; inptr1
push esi ; inptr00
mov ebx, esp
push edi ; output_buf (outptr0)
push ecx ; in_row_group_ctr
push ebx ; input_buf
push eax ; output_width
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
add esi, byte SIZEOF_JSAMPROW ; inptr01
add edi, byte SIZEOF_JSAMPROW ; outptr1
mov POINTER [ebx+0*SIZEOF_POINTER], esi
mov POINTER [ebx-1*SIZEOF_POINTER], edi
call near EXTN(jsimd_h2v1_merged_upsample_sse2)
add esp, byte 7*SIZEOF_DWORD
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,760 @@
;
; jdsample.asm - upsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fancy_upsample_avx2)
EXTN(jconst_fancy_upsample_avx2):
PW_ONE times 16 dw 1
PW_TWO times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
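; For reference, the per-sample arithmetic this routine vectorizes is the
; usual triangle filter (illustrative C sketch, not part of this file; edge
; columns simply reuse the boundary sample, which reduces to a copy since
; (4*x + 1) >> 2 == (4*x + 2) >> 2 == x):
;
;   for (i = 0; i < downsampled_width; i++) {
;     int left  = inptr[i > 0 ? i - 1 : i];
;     int right = inptr[i + 1 < downsampled_width ? i + 1 : i];
;     outptr[2 * i]     = (JSAMPLE)((3 * inptr[i] + left  + 1) >> 2);
;     outptr[2 * i + 1] = (JSAMPLE)((3 * inptr[i] + right + 2) >> 2);
;   }
;
; PW_ONE and PW_TWO are the two rounding terms and PW_THREE is the 3x
; weight used below.
;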
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
EXTN(jsimd_h2v1_fancy_upsample_avx2):
push ebp
mov ebp, esp
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
test eax, SIZEOF_YMMWORD-1
jz short .skip
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
vpcmpeqb xmm7, xmm7, xmm7
vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
add eax, byte SIZEOF_YMMWORD-1
and eax, byte -SIZEOF_YMMWORD
cmp eax, byte SIZEOF_YMMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
vpcmpeqb xmm6, xmm6, xmm6
vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
jmp short .upsample
alignx 16, 7
.columnloop:
vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
vperm2i128 ymm6, ymm0, ymm6, 0x20
vpslldq ymm6, ymm6, 15
.upsample:
vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
vperm2i128 ymm2, ymm0, ymm1, 0x20
vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
vperm2i128 ymm4, ymm0, ymm1, 0x03
vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
vpaddw ymm2, ymm2, ymm1
vpaddw ymm5, ymm5, ymm4
vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
vpaddw ymm3, ymm3, ymm1
vpaddw ymm6, ymm6, ymm4
vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
vpsllw ymm3, ymm3, BYTE_BIT
vpsllw ymm6, ymm6, BYTE_BIT
vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
sub eax, byte SIZEOF_YMMWORD
add esi, byte 1*SIZEOF_YMMWORD ; inptr
add edi, byte 2*SIZEOF_YMMWORD ; outptr
cmp eax, byte SIZEOF_YMMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
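; For reference, the scalar arithmetic being vectorized (illustrative C
; sketch, not part of this file): each input row is first blended 3:1 with
; the row above (for outptr0) or below (for outptr1) into 10-bit column
; sums, which are then blended 3:1 horizontally with rounding terms 8 and 7
; and a final shift by 4:
;
;   colsum[i]    = 3 * in_this[i] + in_other[i];      /* vertical 3:1 */
;   out[2*i]     = (JSAMPLE)((3 * colsum[i] + colsum[i-1] + 8) >> 4);
;   out[2*i + 1] = (JSAMPLE)((3 * colsum[i] + colsum[i+1] + 7) >> 4);
;
; This is why the routine temporarily stores the Int0L/Int0H/Int1L/Int1H
; intermediates in the output rows before the horizontal pass.
;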
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
; ymmword wk[WK_NUM]
%define WK_NUM 4
%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
EXTN(jsimd_h2v2_fancy_upsample_avx2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(edx)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push ecx
push edi
push esi
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
test eax, SIZEOF_YMMWORD-1
jz short .skip
push edx
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
pop edx
.skip:
; -- process the first column block
vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
vpcmpeqb xmm7, xmm7, xmm7
vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vmovdqa YMMWORD [wk(0)], ymm1
vmovdqa YMMWORD [wk(1)], ymm2
poppic ebx
add eax, byte SIZEOF_YMMWORD-1
and eax, byte -SIZEOF_YMMWORD
cmp eax, byte SIZEOF_YMMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
; -- process the last column block
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
vpcmpeqb xmm1, xmm1, xmm1
vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
jmp near .upsample
alignx 16, 7
.columnloop:
; -- process the next column block
vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
vperm2i128 ymm1, ymm3, ymm1, 0x20
vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
vperm2i128 ymm2, ymm3, ymm2, 0x20
vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
vmovdqa YMMWORD [wk(2)], ymm1
vmovdqa YMMWORD [wk(3)], ymm2
.upsample:
; -- process the upper row
vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
vperm2i128 ymm0, ymm1, ymm7, 0x03
vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
vperm2i128 ymm4, ymm1, ymm3, 0x20
vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
vperm2i128 ymm5, ymm1, ymm7, 0x03
vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vperm2i128 ymm6, ymm1, ymm3, 0x20
vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
vperm2i128 ymm2, ymm1, ymm3, 0x03
vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
vperm2i128 ymm4, ymm1, ymm3, 0x03
vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vperm2i128 ymm1, ymm1, ymm7, 0x20
vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
vmovdqa YMMWORD [wk(0)], ymm4
vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
vpaddw ymm1, ymm1, ymm7
vpaddw ymm5, ymm5, ymm3
vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
vpaddw ymm0, ymm0, ymm7
vpaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
vpsllw ymm0, ymm0, BYTE_BIT
vpsllw ymm2, ymm2, BYTE_BIT
vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
; -- process the lower row
vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
vperm2i128 ymm7, ymm1, ymm6, 0x03
vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
vperm2i128 ymm3, ymm1, ymm4, 0x20
vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
vperm2i128 ymm0, ymm1, ymm6, 0x03
vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vperm2i128 ymm2, ymm1, ymm4, 0x20
vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
vperm2i128 ymm5, ymm1, ymm4, 0x03
vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
vperm2i128 ymm3, ymm1, ymm4, 0x03
vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
vperm2i128 ymm1, ymm1, ymm6, 0x20
vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
vmovdqa YMMWORD [wk(1)], ymm3
vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
vpaddw ymm1, ymm1, ymm6
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
vpaddw ymm7, ymm7, ymm6
vpaddw ymm5, ymm5, ymm4
vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
vpsllw ymm7, ymm7, BYTE_BIT
vpsllw ymm5, ymm5, BYTE_BIT
vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
poppic ebx
sub eax, byte SIZEOF_YMMWORD
add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
add edx, byte 2*SIZEOF_YMMWORD ; outptr0
add edi, byte 2*SIZEOF_YMMWORD ; outptr1
cmp eax, byte SIZEOF_YMMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop ecx
pop eax
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
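; The box filter simply replicates each input sample horizontally, i.e.
; (illustrative C sketch, not part of this file):
;
;   for (i = 0; i < output_width / 2; i++)
;     outptr[2 * i] = outptr[2 * i + 1] = inptr[i];
;
; The punpcklbw/punpckhbw-with-self pairs below perform exactly this byte
; duplication, 32 samples per iteration.
;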
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
EXTN(jsimd_h2v1_upsample_avx2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (SIZEOF_YMMWORD-1)
and edx, -SIZEOF_YMMWORD
jz short .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz short .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
cmp eax, byte SIZEOF_YMMWORD
ja near .above_16
vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
vpunpckhbw xmm1, xmm0, xmm0
vpunpcklbw xmm0, xmm0, xmm0
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
jmp short .nextrow
.above_16:
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
vpermq ymm0, ymm0, 0xd8
vpunpckhbw ymm1, ymm0, ymm0
vpunpcklbw ymm0, ymm0, ymm0
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
sub eax, byte 2*SIZEOF_YMMWORD
jz short .nextrow
add esi, byte SIZEOF_YMMWORD ; inptr
add edi, byte 2*SIZEOF_YMMWORD ; outptr
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg short .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
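; Vertically this is plain replication: the same horizontally duplicated
; row is written to both outptr0 and outptr1, i.e. (illustrative C sketch,
; not part of this file):
;
;   for (i = 0; i < output_width / 2; i++)
;     outptr0[2*i] = outptr0[2*i+1] = outptr1[2*i] = outptr1[2*i+1] = inptr[i];
;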
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
EXTN(jsimd_h2v2_upsample_avx2):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (SIZEOF_YMMWORD-1)
and edx, -SIZEOF_YMMWORD
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
cmp eax, byte SIZEOF_YMMWORD
ja short .above_16
vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
vpunpckhbw xmm1, xmm0, xmm0
vpunpcklbw xmm0, xmm0, xmm0
vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
jmp near .nextrow
.above_16:
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
vpermq ymm0, ymm0, 0xd8
vpunpckhbw ymm1, ymm0, ymm0
vpunpcklbw ymm0, ymm0, ymm0
vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
sub eax, byte 2*SIZEOF_YMMWORD
jz short .nextrow
add esi, byte SIZEOF_YMMWORD ; inptr
add ebx, 2*SIZEOF_YMMWORD ; outptr0
add edi, 2*SIZEOF_YMMWORD ; outptr1
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,731 @@
;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fancy_upsample_mmx)
EXTN(jconst_fancy_upsample_mmx):
PW_ONE times 4 dw 1
PW_TWO times 4 dw 2
PW_THREE times 4 dw 3
PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
EXTN(jsimd_h2v1_fancy_upsample_mmx):
push ebp
mov ebp, esp
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
test eax, SIZEOF_MMWORD-1
jz short .skip
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
pxor mm0, mm0 ; mm0=(all 0's)
pcmpeqb mm7, mm7
psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
add eax, byte SIZEOF_MMWORD-1
and eax, byte -SIZEOF_MMWORD
cmp eax, byte SIZEOF_MMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
pcmpeqb mm6, mm6
psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
jmp short .upsample
alignx 16, 7
.columnloop:
movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
.upsample:
movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
movq mm2, mm1
movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
movq mm7, mm1
psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
movq mm4, mm1
punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
movq mm5, mm2
punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
movq mm6, mm3
punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
pmullw mm1, [GOTOFF(ebx,PW_THREE)]
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
paddw mm2, [GOTOFF(ebx,PW_ONE)]
paddw mm5, [GOTOFF(ebx,PW_ONE)]
paddw mm3, [GOTOFF(ebx,PW_TWO)]
paddw mm6, [GOTOFF(ebx,PW_TWO)]
paddw mm2, mm1
paddw mm5, mm4
psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
paddw mm3, mm1
paddw mm6, mm4
psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
psllw mm3, BYTE_BIT
psllw mm6, BYTE_BIT
por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
sub eax, byte SIZEOF_MMWORD
add esi, byte 1*SIZEOF_MMWORD ; inptr
add edi, byte 2*SIZEOF_MMWORD ; outptr
cmp eax, byte SIZEOF_MMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 4
%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
EXTN(jsimd_h2v2_fancy_upsample_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(edx)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push ecx
push edi
push esi
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
test eax, SIZEOF_MMWORD-1
jz short .skip
push edx
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
pop edx
.skip:
; -- process the first column block
movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pxor mm3, mm3 ; mm3=(all 0's)
movq mm4, mm0
punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
movq mm5, mm1
punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
movq mm6, mm2
punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
pmullw mm0, [GOTOFF(ebx,PW_THREE)]
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
pcmpeqb mm7, mm7
psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
pand mm1, mm7 ; mm1=( 0 - - -)
pand mm2, mm7 ; mm2=( 0 - - -)
movq MMWORD [wk(0)], mm1
movq MMWORD [wk(1)], mm2
poppic ebx
add eax, byte SIZEOF_MMWORD-1
and eax, byte -SIZEOF_MMWORD
cmp eax, byte SIZEOF_MMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
; -- process the last column block
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pcmpeqb mm1, mm1
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
movq mm2, mm1
pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
movq MMWORD [wk(2)], mm1
movq MMWORD [wk(3)], mm2
jmp short .upsample
alignx 16, 7
.columnloop:
; -- process the next column block
movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pxor mm3, mm3 ; mm3=(all 0's)
movq mm4, mm0
punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
movq mm5, mm1
punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
movq mm6, mm2
punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
pmullw mm0, [GOTOFF(ebx,PW_THREE)]
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
movq MMWORD [wk(2)], mm1
movq MMWORD [wk(3)], mm2
.upsample:
; -- process the upper row
movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
movq mm0, mm7
movq mm4, mm3
psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
movq mm5, mm7
movq mm6, mm3
psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
por mm0, mm4 ; mm0=( 1 2 3 4)
por mm5, mm6 ; mm5=( 3 4 5 6)
movq mm1, mm7
movq mm2, mm3
psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
movq mm4, mm3
psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
movq MMWORD [wk(0)], mm4
pmullw mm7, [GOTOFF(ebx,PW_THREE)]
pmullw mm3, [GOTOFF(ebx,PW_THREE)]
paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
paddw mm0, [GOTOFF(ebx,PW_SEVEN)]
paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
paddw mm1, mm7
paddw mm5, mm3
psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
paddw mm0, mm7
paddw mm2, mm3
psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
psllw mm0, BYTE_BIT
psllw mm2, BYTE_BIT
por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
; -- process the lower row
movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
movq mm7, mm6
movq mm3, mm4
psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
movq mm0, mm6
movq mm2, mm4
psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
por mm7, mm3 ; mm7=( 1 2 3 4)
por mm0, mm2 ; mm0=( 3 4 5 6)
movq mm1, mm6
movq mm5, mm4
psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
movq mm3, mm4
psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
movq MMWORD [wk(1)], mm3
pmullw mm6, [GOTOFF(ebx,PW_THREE)]
pmullw mm4, [GOTOFF(ebx,PW_THREE)]
paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
paddw mm7, [GOTOFF(ebx,PW_SEVEN)]
paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
paddw mm1, mm6
paddw mm0, mm4
psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
paddw mm7, mm6
paddw mm5, mm4
psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
psllw mm7, BYTE_BIT
psllw mm5, BYTE_BIT
por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
poppic ebx
sub eax, byte SIZEOF_MMWORD
add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
add ebx, byte 1*SIZEOF_MMWORD ; inptr0
add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
add edx, byte 2*SIZEOF_MMWORD ; outptr0
add edi, byte 2*SIZEOF_MMWORD ; outptr1
cmp eax, byte SIZEOF_MMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop ecx
pop eax
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg near .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
EXTN(jsimd_h2v1_upsample_mmx):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (2*SIZEOF_MMWORD)-1
and edx, byte -(2*SIZEOF_MMWORD)
jz short .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz short .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
movq mm1, mm0
punpcklbw mm0, mm0
punpckhbw mm1, mm1
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
sub eax, byte 2*SIZEOF_MMWORD
jz short .nextrow
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
movq mm3, mm2
punpcklbw mm2, mm2
punpckhbw mm3, mm3
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
sub eax, byte 2*SIZEOF_MMWORD
jz short .nextrow
add esi, byte 2*SIZEOF_MMWORD ; inptr
add edi, byte 4*SIZEOF_MMWORD ; outptr
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg short .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
EXTN(jsimd_h2v2_upsample_mmx):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (2*SIZEOF_MMWORD)-1
and edx, byte -(2*SIZEOF_MMWORD)
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz short .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
movq mm1, mm0
punpcklbw mm0, mm0
punpckhbw mm1, mm1
movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
sub eax, byte 2*SIZEOF_MMWORD
jz short .nextrow
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
movq mm3, mm2
punpcklbw mm2, mm2
punpckhbw mm3, mm3
movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
sub eax, byte 2*SIZEOF_MMWORD
jz short .nextrow
add esi, byte 2*SIZEOF_MMWORD ; inptr
add ebx, byte 4*SIZEOF_MMWORD ; outptr0
add edi, byte 4*SIZEOF_MMWORD ; outptr1
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg short .rowloop
emms ; empty MMX state
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,724 @@
;
; jdsample.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2):
PW_ONE times 8 dw 1
PW_TWO times 8 dw 2
PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
EXTN(jsimd_h2v1_fancy_upsample_sse2):
push ebp
mov ebp, esp
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
test eax, SIZEOF_XMMWORD-1
jz short .skip
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
pxor xmm0, xmm0 ; xmm0=(all 0's)
pcmpeqb xmm7, xmm7
psrldq xmm7, (SIZEOF_XMMWORD-1)
pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
add eax, byte SIZEOF_XMMWORD-1
and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
pcmpeqb xmm6, xmm6
pslldq xmm6, (SIZEOF_XMMWORD-1)
pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
jmp short .upsample
alignx 16, 7
.columnloop:
movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
pslldq xmm6, (SIZEOF_XMMWORD-1)
.upsample:
movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm2, xmm1
movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
movdqa xmm7, xmm1
psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
movdqa xmm4, xmm1
punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
movdqa xmm5, xmm2
punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm6, xmm3
punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm2, [GOTOFF(ebx,PW_ONE)]
paddw xmm5, [GOTOFF(ebx,PW_ONE)]
paddw xmm3, [GOTOFF(ebx,PW_TWO)]
paddw xmm6, [GOTOFF(ebx,PW_TWO)]
paddw xmm2, xmm1
paddw xmm5, xmm4
psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
paddw xmm3, xmm1
paddw xmm6, xmm4
psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
psllw xmm3, BYTE_BIT
psllw xmm6, BYTE_BIT
por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
sub eax, byte SIZEOF_XMMWORD
add esi, byte 1*SIZEOF_XMMWORD ; inptr
add edi, byte 2*SIZEOF_XMMWORD ; outptr
cmp eax, byte SIZEOF_XMMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop eax
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 4
%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
EXTN(jsimd_h2v2_fancy_upsample_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic eax ; make room for the GOT address
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
get_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
test eax, eax
jz near .return
mov ecx, INT [max_v_samp(edx)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push eax ; colctr
push ecx
push edi
push esi
mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
test eax, SIZEOF_XMMWORD-1
jz short .skip
push edx
mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
pop edx
.skip:
; -- process the first column block
movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4, xmm0
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5, xmm1
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6, xmm2
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
pcmpeqb xmm7, xmm7
psrldq xmm7, (SIZEOF_XMMWORD-2)
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
movdqa XMMWORD [wk(0)], xmm1
movdqa XMMWORD [wk(1)], xmm2
poppic ebx
add eax, byte SIZEOF_XMMWORD-1
and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop
alignx 16, 7
.columnloop_last:
; -- process the last column block
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pcmpeqb xmm1, xmm1
pslldq xmm1, (SIZEOF_XMMWORD-2)
movdqa xmm2, xmm1
pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
jmp near .upsample
alignx 16, 7
.columnloop:
; -- process the next column block
movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
pushpic ebx
movpic ebx, POINTER [gotptr] ; load GOT address
pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4, xmm0
punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
movdqa xmm5, xmm1
punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
movdqa xmm6, xmm2
punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
movdqa XMMWORD [wk(2)], xmm1
movdqa XMMWORD [wk(3)], xmm2
.upsample:
; -- process the upper row
movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
movdqa xmm5, xmm7
movdqa xmm6, xmm3
psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
movdqa xmm1, xmm7
movdqa xmm2, xmm3
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
movdqa xmm4, xmm3
psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(0)], xmm4
pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm1, xmm7
paddw xmm5, xmm3
psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
paddw xmm0, xmm7
paddw xmm2, xmm3
psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
psllw xmm0, BYTE_BIT
psllw xmm2, BYTE_BIT
por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
; -- process the lower row
movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
movdqa xmm0, xmm6
movdqa xmm2, xmm4
psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
movdqa xmm1, xmm6
movdqa xmm5, xmm4
pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
movdqa xmm3, xmm4
psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
movdqa XMMWORD [wk(1)], xmm3
pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
paddw xmm1, xmm6
paddw xmm0, xmm4
psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
paddw xmm7, xmm6
paddw xmm5, xmm4
psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
psllw xmm7, BYTE_BIT
psllw xmm5, BYTE_BIT
por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
poppic ebx
sub eax, byte SIZEOF_XMMWORD
add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
add edx, byte 2*SIZEOF_XMMWORD ; outptr0
add edi, byte 2*SIZEOF_XMMWORD ; outptr1
cmp eax, byte SIZEOF_XMMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop esi
pop edi
pop ecx
pop eax
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg near .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
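;
; A minimal scalar sketch of the same operation (plain C, ad hoc names; shown
; only as a reading aid, not part of this file).  Each input sample is simply
; written twice, which is what the punpcklbw/punpckhbw pairs below do sixteen
; bytes at a time:
;
;   void h2v1_upsample_ref(int max_v_samp_factor, unsigned int output_width,
;                          unsigned char **input_data, unsigned char **output_data)
;   {
;     for (int row = 0; row < max_v_samp_factor; row++) {
;       const unsigned char *inptr = input_data[row];
;       unsigned char *outptr = output_data[row];
;       for (unsigned int col = 0; col < output_width; col += 2) {
;         outptr[col]     = inptr[col / 2];   /* replicate each sample */
;         outptr[col + 1] = inptr[col / 2];   /* twice horizontally    */
;       }
;     }
;   }
;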
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
EXTN(jsimd_h2v1_upsample_sse2):
push ebp
mov ebp, esp
; push ebx ; unused
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (2*SIZEOF_XMMWORD)-1
and edx, byte -(2*SIZEOF_XMMWORD)
jz short .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz short .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
sub eax, byte 2*SIZEOF_XMMWORD
jz short .nextrow
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm3, xmm2
punpcklbw xmm2, xmm2
punpckhbw xmm3, xmm3
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
sub eax, byte 2*SIZEOF_XMMWORD
jz short .nextrow
add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 4*SIZEOF_XMMWORD ; outptr
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte SIZEOF_JSAMPROW ; input_data
add edi, byte SIZEOF_JSAMPROW ; output_data
dec ecx ; rowctr
jg short .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
; pop ebx ; unused
pop ebp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
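;
; The scalar equivalent (plain C, ad hoc names; a hedged reading aid only)
; differs from the h2v1 case above in that each doubled row is stored twice,
; exactly as the loop below writes the same xmm registers to outptr0 and
; outptr1:
;
;   void h2v2_upsample_ref(int max_v_samp_factor, unsigned int output_width,
;                          unsigned char **input_data, unsigned char **output_data)
;   {
;     for (int row = 0; row < max_v_samp_factor / 2; row++) {
;       const unsigned char *inptr = input_data[row];
;       unsigned char *out0 = output_data[row * 2];
;       unsigned char *out1 = output_data[row * 2 + 1];
;       for (unsigned int col = 0; col < output_width; col += 2) {
;         out0[col] = out0[col + 1] = inptr[col / 2];  /* 2:1 horizontal */
;         out1[col] = out1[col + 1] = inptr[col / 2];  /* 2:1 vertical   */
;       }
;     }
;   }
;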
%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
%define output_width(b) (b) + 12 ; JDIMENSION output_width
%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
align 32
GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
EXTN(jsimd_h2v2_upsample_sse2):
push ebp
mov ebp, esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
mov edx, JDIMENSION [output_width(ebp)]
add edx, byte (2*SIZEOF_XMMWORD)-1
and edx, byte -(2*SIZEOF_XMMWORD)
jz near .return
mov ecx, INT [max_v_samp(ebp)] ; rowctr
test ecx, ecx
jz near .return
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7
.rowloop:
push edi
push esi
mov esi, JSAMPROW [esi] ; inptr
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr
alignx 16, 7
.columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
sub eax, byte 2*SIZEOF_XMMWORD
jz short .nextrow
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
movdqa xmm3, xmm2
punpcklbw xmm2, xmm2
punpckhbw xmm3, xmm3
movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
sub eax, byte 2*SIZEOF_XMMWORD
jz short .nextrow
add esi, byte 2*SIZEOF_XMMWORD ; inptr
add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
add edi, byte 4*SIZEOF_XMMWORD ; outptr1
jmp short .columnloop
alignx 16, 7
.nextrow:
pop esi
pop edi
add esi, byte 1*SIZEOF_JSAMPROW ; input_data
add edi, byte 2*SIZEOF_JSAMPROW ; output_data
sub ecx, byte 2 ; rowctr
jg short .rowloop
.return:
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,318 @@
;
; jfdctflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
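;
; As a hedged reading aid (plain C; a condensed paraphrase of the scalar
; algorithm this file vectorizes two rows/columns at a time), one 1-D pass of
; the AAN floating-point FDCT looks roughly like this, with the constants
; matching PD_0_382/PD_0_707/PD_0_541/PD_1_306 below:
;
;   static void fdct_float_1d(float d[8])
;   {
;     float tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
;     float tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
;     float tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
;     float tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];
;
;     /* Even part */
;     float tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
;     float tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
;     d[0] = tmp10 + tmp11;
;     d[4] = tmp10 - tmp11;
;     float z1 = (tmp12 + tmp13) * 0.707106781f;
;     d[2] = tmp13 + z1;
;     d[6] = tmp13 - z1;
;
;     /* Odd part */
;     tmp10 = tmp4 + tmp5; tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7;
;     float z5 = (tmp10 - tmp12) * 0.382683433f;
;     float z2 = 0.541196100f * tmp10 + z5;
;     float z4 = 1.306562965f * tmp12 + z5;
;     float z3 = tmp11 * 0.707106781f;
;     float z11 = tmp7 + z3, z13 = tmp7 - z3;
;     d[5] = z13 + z2;  d[3] = z13 - z2;
;     d[1] = z11 + z4;  d[7] = z11 - z4;
;   }
;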
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fdct_float_3dnow)
EXTN(jconst_fdct_float_3dnow):
PD_0_382 times 2 dd 0.382683432365089771728460
PD_0_707 times 2 dd 0.707106781186547524400844
PD_0_541 times 2 dd 0.541196100146196984399723
PD_1_306 times 2 dd 1.306562964876376527856643
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
;
%define data(b) (b) + 8 ; FAST_FLOAT *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
align 32
GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
EXTN(jsimd_fdct_float_3dnow):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2
alignx 16, 7
.rowloop:
movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
movq mm4, mm0 ; transpose coefficients
punpckldq mm0, mm1 ; mm0=(00 10)=data0
punpckhdq mm4, mm1 ; mm4=(01 11)=data1
movq mm5, mm2 ; transpose coefficients
punpckldq mm2, mm3 ; mm2=(06 16)=data6
punpckhdq mm5, mm3 ; mm5=(07 17)=data7
movq mm6, mm4
movq mm7, mm0
pfsub mm4, mm2 ; mm4=data1-data6=tmp6
pfsub mm0, mm5 ; mm0=data0-data7=tmp7
pfadd mm6, mm2 ; mm6=data1+data6=tmp1
pfadd mm7, mm5 ; mm7=data0+data7=tmp0
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
movq mm4, mm1 ; transpose coefficients
punpckldq mm1, mm3 ; mm1=(02 12)=data2
punpckhdq mm4, mm3 ; mm4=(03 13)=data3
movq mm0, mm2 ; transpose coefficients
punpckldq mm2, mm5 ; mm2=(04 14)=data4
punpckhdq mm0, mm5 ; mm0=(05 15)=data5
movq mm3, mm4
movq mm5, mm1
pfadd mm4, mm2 ; mm4=data3+data4=tmp3
pfadd mm1, mm0 ; mm1=data2+data5=tmp2
pfsub mm3, mm2 ; mm3=data3-data4=tmp4
pfsub mm5, mm0 ; mm5=data2-data5=tmp5
; -- Even part
movq mm2, mm7
movq mm0, mm6
pfsub mm7, mm4 ; mm7=tmp13
pfsub mm6, mm1 ; mm6=tmp12
pfadd mm2, mm4 ; mm2=tmp10
pfadd mm0, mm1 ; mm0=tmp11
pfadd mm6, mm7
pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
movq mm4, mm2
movq mm1, mm7
pfsub mm2, mm0 ; mm2=data4
pfsub mm7, mm6 ; mm7=data6
pfadd mm4, mm0 ; mm4=data0
pfadd mm1, mm6 ; mm1=data2
movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
; -- Odd part
movq mm0, MMWORD [wk(0)] ; mm0=tmp6
movq mm6, MMWORD [wk(1)] ; mm6=tmp7
pfadd mm3, mm5 ; mm3=tmp10
pfadd mm5, mm0 ; mm5=tmp11
pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
movq mm2, mm3 ; mm2=tmp10
pfsub mm3, mm0
pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
pfadd mm2, mm3 ; mm2=z2
pfadd mm0, mm3 ; mm0=z4
movq mm7, mm6
pfsub mm6, mm5 ; mm6=z13
pfadd mm7, mm5 ; mm7=z11
movq mm4, mm6
movq mm1, mm7
pfsub mm6, mm2 ; mm6=data3
pfsub mm7, mm0 ; mm7=data7
pfadd mm4, mm2 ; mm4=data5
pfadd mm1, mm0 ; mm1=data1
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz near .rowloop
; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2
alignx 16, 7
.columnloop:
movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
movq mm4, mm0 ; transpose coefficients
punpckldq mm0, mm1 ; mm0=(00 01)=data0
punpckhdq mm4, mm1 ; mm4=(10 11)=data1
movq mm5, mm2 ; transpose coefficients
punpckldq mm2, mm3 ; mm2=(60 61)=data6
punpckhdq mm5, mm3 ; mm5=(70 71)=data7
movq mm6, mm4
movq mm7, mm0
pfsub mm4, mm2 ; mm4=data1-data6=tmp6
pfsub mm0, mm5 ; mm0=data0-data7=tmp7
pfadd mm6, mm2 ; mm6=data1+data6=tmp1
pfadd mm7, mm5 ; mm7=data0+data7=tmp0
movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
movq mm4, mm1 ; transpose coefficients
punpckldq mm1, mm3 ; mm1=(20 21)=data2
punpckhdq mm4, mm3 ; mm4=(30 31)=data3
movq mm0, mm2 ; transpose coefficients
punpckldq mm2, mm5 ; mm2=(40 41)=data4
punpckhdq mm0, mm5 ; mm0=(50 51)=data5
movq mm3, mm4
movq mm5, mm1
pfadd mm4, mm2 ; mm4=data3+data4=tmp3
pfadd mm1, mm0 ; mm1=data2+data5=tmp2
pfsub mm3, mm2 ; mm3=data3-data4=tmp4
pfsub mm5, mm0 ; mm5=data2-data5=tmp5
; -- Even part
movq mm2, mm7
movq mm0, mm6
pfsub mm7, mm4 ; mm7=tmp13
pfsub mm6, mm1 ; mm6=tmp12
pfadd mm2, mm4 ; mm2=tmp10
pfadd mm0, mm1 ; mm0=tmp11
pfadd mm6, mm7
pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
movq mm4, mm2
movq mm1, mm7
pfsub mm2, mm0 ; mm2=data4
pfsub mm7, mm6 ; mm7=data6
pfadd mm4, mm0 ; mm4=data0
pfadd mm1, mm6 ; mm1=data2
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
; -- Odd part
movq mm0, MMWORD [wk(0)] ; mm0=tmp6
movq mm6, MMWORD [wk(1)] ; mm6=tmp7
pfadd mm3, mm5 ; mm3=tmp10
pfadd mm5, mm0 ; mm5=tmp11
pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
movq mm2, mm3 ; mm2=tmp10
pfsub mm3, mm0
pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
pfadd mm2, mm3 ; mm2=z2
pfadd mm0, mm3 ; mm0=z4
movq mm7, mm6
pfsub mm6, mm5 ; mm6=z13
pfadd mm7, mm5 ; mm7=z11
movq mm4, mm6
movq mm1, mm7
pfsub mm6, mm2 ; mm6=data3
pfsub mm7, mm0 ; mm7=data7
pfadd mm4, mm2 ; mm4=data5
pfadd mm1, mm0 ; mm1=data1
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
add edx, byte 2*SIZEOF_FAST_FLOAT
dec ecx
jnz near .columnloop
femms ; empty MMX/3DNow! state
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,369 @@
;
; jfdctflt.asm - floating-point FDCT (SSE)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44
%endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE
%endmacro
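;
; Equivalently, with SSE intrinsics (a hedged illustration with ad hoc names;
; assumes <xmmintrin.h>), the two macros above move 64-bit float pairs:
;
;   #include <xmmintrin.h>
;   /* (a0 a1 a2 a3), (b0 b1 b2 b3) -> (a0 a1 b0 b1) */
;   static __m128 unpcklps2_ref(__m128 a, __m128 b) { return _mm_shuffle_ps(a, b, 0x44); }
;   /* (a0 a1 a2 a3), (b0 b1 b2 b3) -> (a2 a3 b2 b3) */
;   static __m128 unpckhps2_ref(__m128 a, __m128 b) { return _mm_shuffle_ps(a, b, 0xEE); }
;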
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse):
PD_0_382 times 4 dd 0.382683432365089771728460
PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_sse(FAST_FLOAT *data)
;
%define data(b) (b) + 8 ; FAST_FLOAT *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
GLOBAL_FUNCTION(jsimd_fdct_float_sse)
EXTN(jsimd_fdct_float_sse):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16, 7
.rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0, xmm5
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
dec ecx
jnz near .rowloop
; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4
alignx 16, 7
.columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
movaps xmm4, xmm0 ; transpose coefficients(phase 1)
unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
movaps xmm5, xmm2 ; transpose coefficients(phase 1)
unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
movaps xmm4, xmm6 ; transpose coefficients(phase 1)
unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
movaps xmm2, xmm1 ; transpose coefficients(phase 1)
unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
movaps xmm7, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
movaps xmm3, xmm2 ; transpose coefficients(phase 2)
unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
movaps xmm0, xmm7
movaps xmm5, xmm6
subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movaps xmm7, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
movaps xmm6, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
movaps xmm2, xmm7
movaps xmm3, xmm4
addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
movaps xmm1, xmm5
movaps xmm6, xmm0
subps xmm5, xmm7 ; xmm5=tmp13
subps xmm0, xmm4 ; xmm0=tmp12
addps xmm1, xmm7 ; xmm1=tmp10
addps xmm6, xmm4 ; xmm6=tmp11
addps xmm0, xmm5
mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
movaps xmm7, xmm1
movaps xmm4, xmm5
subps xmm1, xmm6 ; xmm1=data4
subps xmm5, xmm0 ; xmm5=data6
addps xmm7, xmm6 ; xmm7=data0
addps xmm4, xmm0 ; xmm4=data2
movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
; -- Odd part
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
addps xmm2, xmm3 ; xmm2=tmp10
addps xmm3, xmm6 ; xmm3=tmp11
addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
movaps xmm1, xmm2 ; xmm1=tmp10
subps xmm2, xmm6
mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
addps xmm1, xmm2 ; xmm1=z2
addps xmm6, xmm2 ; xmm6=z4
movaps xmm5, xmm0
subps xmm0, xmm3 ; xmm0=z13
addps xmm5, xmm3 ; xmm5=z11
movaps xmm7, xmm0
movaps xmm4, xmm5
subps xmm0, xmm1 ; xmm0=data3
subps xmm5, xmm6 ; xmm5=data7
addps xmm7, xmm1 ; xmm7=data5
addps xmm4, xmm6 ; xmm4=data1
movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
add edx, byte 4*SIZEOF_FAST_FLOAT
dec ecx
jnz near .columnloop
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,395 @@
;
; jfdctfst.asm - fast integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
; for more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
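;
; A hedged worked example of the scaling above (plain C, ad hoc name): with
; CONST_BITS = 8, F_0_707 = 181 ~ 0.707 * 2^8 is stored shifted left by
; CONST_SHIFT = 6, the operand is pre-shifted left by 2, and pmulhw keeps the
; upper 16 bits of the product, so the 2 + 6 + 8 = 16 scaling bits cancel:
;
;   /* 32-bit arithmetic is used here only to sidestep the 16-bit wraparound
;      that the real code avoids by bounding the sample range */
;   static short mul_0_707(short x)
;   {
;     int c = 181 << 6;                /* one word of PW_F0707              */
;     int p = (x << 2) * c;            /* pre-scaled operand times constant */
;     return (short)(p >> 16);         /* pmulhw: x * 181 / 256 ~ x * 0.707 */
;   }
;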
alignz 32
GLOBAL_DATA(jconst_fdct_ifast_mmx)
EXTN(jconst_fdct_ifast_mmx):
PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_mmx(DCTELEM *data)
;
%define data(b) (b) + 8 ; DCTELEM *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
align 32
GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
EXTN(jsimd_fdct_ifast_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4
alignx 16, 7
.rowloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
; mm0=(20 21 22 23), mm2=(24 25 26 27)
; mm1=(30 31 32 33), mm3=(34 35 36 37)
movq mm4, mm0 ; transpose coefficients(phase 1)
punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
movq mm5, mm2 ; transpose coefficients(phase 1)
punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
; mm6=(00 01 02 03), mm1=(04 05 06 07)
; mm7=(10 11 12 13), mm3=(14 15 16 17)
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
movq mm4, mm6 ; transpose coefficients(phase 1)
punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
movq mm2, mm1 ; transpose coefficients(phase 1)
punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
movq mm7, mm6 ; transpose coefficients(phase 2)
punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
movq mm3, mm2 ; transpose coefficients(phase 2)
punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
movq mm0, mm7
movq mm5, mm6
psubw mm7, mm2 ; mm7=data1-data6=tmp6
psubw mm6, mm3 ; mm6=data0-data7=tmp7
paddw mm0, mm2 ; mm0=data1+data6=tmp1
paddw mm5, mm3 ; mm5=data0+data7=tmp0
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
movq mm7, mm4 ; transpose coefficients(phase 2)
punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
movq mm6, mm1 ; transpose coefficients(phase 2)
punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
movq mm2, mm7
movq mm3, mm4
paddw mm7, mm1 ; mm7=data3+data4=tmp3
paddw mm4, mm6 ; mm4=data2+data5=tmp2
psubw mm2, mm1 ; mm2=data3-data4=tmp4
psubw mm3, mm6 ; mm3=data2-data5=tmp5
; -- Even part
movq mm1, mm5
movq mm6, mm0
psubw mm5, mm7 ; mm5=tmp13
psubw mm0, mm4 ; mm0=tmp12
paddw mm1, mm7 ; mm1=tmp10
paddw mm6, mm4 ; mm6=tmp11
paddw mm0, mm5
psllw mm0, PRE_MULTIPLY_SCALE_BITS
pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
movq mm7, mm1
movq mm4, mm5
psubw mm1, mm6 ; mm1=data4
psubw mm5, mm0 ; mm5=data6
paddw mm7, mm6 ; mm7=data0
paddw mm4, mm0 ; mm4=data2
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
; -- Odd part
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
paddw mm2, mm3 ; mm2=tmp10
paddw mm3, mm6 ; mm3=tmp11
paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
psllw mm2, PRE_MULTIPLY_SCALE_BITS
psllw mm6, PRE_MULTIPLY_SCALE_BITS
psllw mm3, PRE_MULTIPLY_SCALE_BITS
pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
movq mm1, mm2 ; mm1=tmp10
psubw mm2, mm6
pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
paddw mm1, mm2 ; mm1=z2
paddw mm6, mm2 ; mm6=z4
movq mm5, mm0
psubw mm0, mm3 ; mm0=z13
paddw mm5, mm3 ; mm5=z11
movq mm7, mm0
movq mm4, mm5
psubw mm0, mm1 ; mm0=data3
psubw mm5, mm6 ; mm5=data7
paddw mm7, mm1 ; mm7=data5
paddw mm4, mm6 ; mm4=data1
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec ecx
jnz near .rowloop
; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4
alignx 16, 7
.columnloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; mm0=(02 12 22 32), mm2=(42 52 62 72)
; mm1=(03 13 23 33), mm3=(43 53 63 73)
movq mm4, mm0 ; transpose coefficients(phase 1)
punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
movq mm5, mm2 ; transpose coefficients(phase 1)
punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
; mm6=(00 10 20 30), mm1=(40 50 60 70)
; mm7=(01 11 21 31), mm3=(41 51 61 71)
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
movq mm4, mm6 ; transpose coefficients(phase 1)
punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
movq mm2, mm1 ; transpose coefficients(phase 1)
punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
movq mm7, mm6 ; transpose coefficients(phase 2)
punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
movq mm3, mm2 ; transpose coefficients(phase 2)
punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
movq mm0, mm7
movq mm5, mm6
psubw mm7, mm2 ; mm7=data1-data6=tmp6
psubw mm6, mm3 ; mm6=data0-data7=tmp7
paddw mm0, mm2 ; mm0=data1+data6=tmp1
paddw mm5, mm3 ; mm5=data0+data7=tmp0
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
movq mm7, mm4 ; transpose coefficients(phase 2)
punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
movq mm6, mm1 ; transpose coefficients(phase 2)
punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
movq mm2, mm7
movq mm3, mm4
paddw mm7, mm1 ; mm7=data3+data4=tmp3
paddw mm4, mm6 ; mm4=data2+data5=tmp2
psubw mm2, mm1 ; mm2=data3-data4=tmp4
psubw mm3, mm6 ; mm3=data2-data5=tmp5
; -- Even part
movq mm1, mm5
movq mm6, mm0
psubw mm5, mm7 ; mm5=tmp13
psubw mm0, mm4 ; mm0=tmp12
paddw mm1, mm7 ; mm1=tmp10
paddw mm6, mm4 ; mm6=tmp11
paddw mm0, mm5
psllw mm0, PRE_MULTIPLY_SCALE_BITS
pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
movq mm7, mm1
movq mm4, mm5
psubw mm1, mm6 ; mm1=data4
psubw mm5, mm0 ; mm5=data6
paddw mm7, mm6 ; mm7=data0
paddw mm4, mm0 ; mm4=data2
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
; -- Odd part
movq mm6, MMWORD [wk(0)] ; mm6=tmp6
movq mm0, MMWORD [wk(1)] ; mm0=tmp7
paddw mm2, mm3 ; mm2=tmp10
paddw mm3, mm6 ; mm3=tmp11
paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
psllw mm2, PRE_MULTIPLY_SCALE_BITS
psllw mm6, PRE_MULTIPLY_SCALE_BITS
psllw mm3, PRE_MULTIPLY_SCALE_BITS
pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
movq mm1, mm2 ; mm1=tmp10
psubw mm2, mm6
pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
paddw mm1, mm2 ; mm1=z2
paddw mm6, mm2 ; mm6=z4
movq mm5, mm0
psubw mm0, mm3 ; mm0=z13
paddw mm5, mm3 ; mm5=z11
movq mm7, mm0
movq mm4, mm5
psubw mm0, mm1 ; mm0=data3
psubw mm5, mm6 ; mm5=data7
paddw mm7, mm1 ; mm7=data5
paddw mm4, mm6 ; mm4=data1
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
add edx, byte 4*SIZEOF_DCTELEM
dec ecx
jnz near .columnloop
emms ; empty MMX state
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,403 @@
;
; jfdctfst.asm - fast integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
; for more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%define CONST_BITS 8 ; 14 is also OK.
%if CONST_BITS == 8
F_0_382 equ 98 ; FIX(0.382683433)
F_0_541 equ 139 ; FIX(0.541196100)
F_0_707 equ 181 ; FIX(0.707106781)
F_1_306 equ 334 ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
%endif
; --------------------------------------------------------------------------
SECTION SEG_CONST
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32
GLOBAL_DATA(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2):
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_sse2(DCTELEM *data)
;
%define data(b) (b) + 8 ; DCTELEM *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
EXTN(jsimd_fdct_ifast_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6, xmm1
movdqa xmm3, xmm0
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2, xmm1
movdqa xmm5, xmm7
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part
movdqa xmm4, xmm3
movdqa xmm0, xmm6
psubw xmm3, xmm1 ; xmm3=tmp13
psubw xmm6, xmm7 ; xmm6=tmp12
paddw xmm4, xmm1 ; xmm4=tmp10
paddw xmm0, xmm7 ; xmm0=tmp11
paddw xmm6, xmm3
psllw xmm6, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
movdqa xmm1, xmm4
movdqa xmm7, xmm3
psubw xmm4, xmm0 ; xmm4=data4
psubw xmm3, xmm6 ; xmm3=data6
paddw xmm1, xmm0 ; xmm1=data0
paddw xmm7, xmm6 ; xmm7=data2
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
; -- Odd part
paddw xmm2, xmm5 ; xmm2=tmp10
paddw xmm5, xmm0 ; xmm5=tmp11
paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
psllw xmm2, PRE_MULTIPLY_SCALE_BITS
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
movdqa xmm4, xmm2 ; xmm4=tmp10
psubw xmm2, xmm0
pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm2 ; xmm4=z2
paddw xmm0, xmm2 ; xmm0=z4
movdqa xmm3, xmm6
psubw xmm6, xmm5 ; xmm6=z13
paddw xmm3, xmm5 ; xmm3=z11
movdqa xmm2, xmm6
movdqa xmm5, xmm3
psubw xmm6, xmm4 ; xmm6=data3
psubw xmm3, xmm0 ; xmm3=data7
paddw xmm2, xmm4 ; xmm2=data5
paddw xmm5, xmm0 ; xmm5=data1
; ---- Pass 2: process columns.
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
movdqa xmm5, xmm6
movdqa xmm3, xmm1
psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
movdqa xmm7, xmm6
movdqa xmm0, xmm2
paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
; -- Even part
movdqa xmm4, xmm3
movdqa xmm1, xmm5
psubw xmm3, xmm6 ; xmm3=tmp13
psubw xmm5, xmm2 ; xmm5=tmp12
paddw xmm4, xmm6 ; xmm4=tmp10
paddw xmm1, xmm2 ; xmm1=tmp11
paddw xmm5, xmm3
psllw xmm5, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
movdqa xmm6, xmm4
movdqa xmm2, xmm3
psubw xmm4, xmm1 ; xmm4=data4
psubw xmm3, xmm5 ; xmm3=data6
paddw xmm6, xmm1 ; xmm6=data0
paddw xmm2, xmm5 ; xmm2=data2
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
; -- Odd part
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
paddw xmm7, xmm0 ; xmm7=tmp10
paddw xmm0, xmm1 ; xmm0=tmp11
paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
psllw xmm7, PRE_MULTIPLY_SCALE_BITS
psllw xmm1, PRE_MULTIPLY_SCALE_BITS
psllw xmm0, PRE_MULTIPLY_SCALE_BITS
pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
movdqa xmm4, xmm7 ; xmm4=tmp10
psubw xmm7, xmm1
pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
paddw xmm4, xmm7 ; xmm4=z2
paddw xmm1, xmm7 ; xmm1=z4
movdqa xmm3, xmm5
psubw xmm5, xmm0 ; xmm5=z13
paddw xmm3, xmm0 ; xmm3=z11
movdqa xmm6, xmm5
movdqa xmm2, xmm3
psubw xmm5, xmm4 ; xmm5=data3
psubw xmm3, xmm1 ; xmm3=data7
paddw xmm6, xmm4 ; xmm6=data5
paddw xmm2, xmm1 ; xmm2=data1
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,331 @@
;
; jfdctint.asm - accurate integer FDCT (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see jfdctint.c for
; more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%define CONST_BITS 13
%define PASS1_BITS 2
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
%if CONST_BITS == 13
F_0_298 equ 2446 ; FIX(0.298631336)
F_0_390 equ 3196 ; FIX(0.390180644)
F_0_541 equ 4433 ; FIX(0.541196100)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_175 equ 9633 ; FIX(1.175875602)
F_1_501 equ 12299 ; FIX(1.501321110)
F_1_847 equ 15137 ; FIX(1.847759065)
F_1_961 equ 16069 ; FIX(1.961570560)
F_2_053 equ 16819 ; FIX(2.053119869)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
; In-place 8x8x16-bit matrix transpose using AVX2 instructions
; %1-%4: Input/output registers
; %5-%8: Temp registers
%macro dotranspose 8
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
vpunpcklwd %5, %1, %2
vpunpckhwd %6, %1, %2
vpunpcklwd %7, %3, %4
vpunpckhwd %8, %3, %4
; transpose coefficients(phase 1)
; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
vpunpckldq %1, %5, %7
vpunpckhdq %2, %5, %7
vpunpckldq %3, %6, %8
vpunpckhdq %4, %6, %8
; transpose coefficients(phase 2)
; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
vpermq %1, %1, 0x8D
vpermq %2, %2, 0x8D
vpermq %3, %3, 0xD8
vpermq %4, %4, 0xD8
; transpose coefficients(phase 3)
; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
%endmacro
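;
; A hedged reading aid for the 128-bit lane moves used by this macro and by
; dodct below (AVX2 intrinsics, ad hoc names; assumes <immintrin.h>):
;
;   #include <immintrin.h>
;   /* vperm2i128, imm 0x20: low lane of a | low lane of b */
;   static __m256i lo_lo(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x20); }
;   /* vperm2i128, imm 0x31: high lane of a | high lane of b */
;   static __m256i hi_hi(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x31); }
;   /* vperm2i128, imm 0x01 with both operands equal: swap the two lanes */
;   static __m256i swap128(__m256i a)          { return _mm256_permute2x128_si256(a, a, 0x01); }
;   /* vpermq, imm 0x8D: qwords reordered to (1 3 0 2); imm 0xD8: (0 2 1 3) */
;   static __m256i perm_8d(__m256i a)          { return _mm256_permute4x64_epi64(a, 0x8D); }
;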
; --------------------------------------------------------------------------
; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
; %1-%4: Input/output registers
; %5-%8: Temp registers
; %9: Pass (1 or 2)
%macro dodct 9
vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
; -- Even part
vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11
vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
%if %9 == 1
vpsllw %1, %7, PASS1_BITS ; %1=data0_4
%else
vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
vpsraw %1, %7, PASS1_BITS ; %1=data0_4
%endif
; (Original)
; z1 = (tmp12 + tmp13) * 0.541196100;
; data2 = z1 + tmp13 * 0.765366865;
; data6 = z1 + tmp12 * -1.847759065;
;
; (This implementation)
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
vpunpcklwd %2, %6, %7
vpunpckhwd %6, %6, %7
vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L
vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H
vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpsrad %2, %2, DESCALE_P %+ %9
vpsrad %6, %6, DESCALE_P %+ %9
vpackssdw %3, %2, %6 ; %6=data2_6
; -- Odd part
vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
; (Original)
; z5 = (z3 + z4) * 1.175875602;
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
; z3 += z5; z4 += z5;
;
; (This implementation)
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
vpunpcklwd %6, %7, %2
vpunpckhwd %7, %7, %2
vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L
vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H
; (Original)
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
;
; (This implementation)
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4;
vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
vpunpcklwd %2, %8, %4
vpunpckhwd %4, %8, %4
vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L
vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H
vpaddd %2, %2, %6 ; %2=data7_5L
vpaddd %4, %4, %7 ; %4=data7_5H
vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpsrad %2, %2, DESCALE_P %+ %9
vpsrad %4, %4, DESCALE_P %+ %9
vpackssdw %4, %2, %4 ; %4=data7_5
vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
vpunpcklwd %8, %5, %2
vpunpckhwd %5, %5, %2
vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L
vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H
vpaddd %8, %8, %6 ; %8=data3_1L
vpaddd %5, %5, %7 ; %5=data3_1H
vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
vpsrad %8, %8, DESCALE_P %+ %9
vpsrad %5, %5, DESCALE_P %+ %9
vpackssdw %2, %8, %5 ; %2=data3_1
%endmacro
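;
; A hedged note on the "(This implementation)" rewrites above (plain C, ad hoc
; names): each pair of outputs is regrouped so that a single vpmaddwd per word
; pair computes it.  For the even part, with
; z1 = (tmp12 + tmp13) * 0.541196100, data2 = z1 + tmp13 * 0.765366865 and
; data6 = z1 - tmp12 * 1.847759065 become dot products against the word pairs
; of PW_F130_F054_MF130_F054 below (constants shown for CONST_BITS = 13):
;
;   /* one lane pair of pmaddwd/vpmaddwd: a0*b0 + a1*b1 in 32 bits */
;   static int madd(short a0, short b0, short a1, short b1)
;   { return (int)a0 * b0 + (int)a1 * b1; }
;
;   static int even_data2(short tmp13, short tmp12)
;   { return madd(tmp13, 4433 + 6270, tmp12, 4433); }          /* F_0_541+F_0_765, F_0_541 */
;   static int even_data6(short tmp13, short tmp12)
;   { return madd(tmp13, 4433,         tmp12, 4433 - 15137); } /* F_0_541, F_0_541-F_1_847 */
;
;   /* both results still need the PD_DESCALE/DESCALE_P rounding shift */
;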
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fdct_islow_avx2)
EXTN(jconst_fdct_islow_avx2):
PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
times 4 dw (F_0_541 - F_1_847), F_0_541
PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
times 4 dw (F_1_175 - F_0_390), F_1_175
PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
times 4 dw (F_2_053 - F_2_562), -F_2_562
PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
times 4 dw (F_1_501 - F_0_899), -F_0_899
PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
PW_1_NEG1 times 8 dw 1
times 8 dw -1
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_avx2(DCTELEM *data)
;
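; The transform is performed in place: the 64 DCTELEM samples at *data are
; overwritten with the DCT coefficients, which (as in jfdctint.c) come out
; scaled up by an overall factor of 8 relative to a true DCT.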
%define data(b) (b) + 8 ; DCTELEM *data
align 32
GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
EXTN(jsimd_fdct_islow_avx2):
push ebp
mov ebp, esp
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
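; When built as PIC, ebx holds the GOT address so that the constant tables
; below can be addressed via GOTOFF(ebx, ...).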
; ---- Pass 1: process rows.
mov edx, POINTER [data(ebp)] ; (DCTELEM *)
vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
vperm2i128 ymm0, ymm4, ymm6, 0x20
vperm2i128 ymm1, ymm4, ymm6, 0x31
vperm2i128 ymm2, ymm5, ymm7, 0x20
vperm2i128 ymm3, ymm5, ymm7, 0x31
; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
; ---- Pass 2: process columns.
vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
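; Put the 128-bit lanes back into row order so that each 256-bit store below
; writes two consecutive rows of the coefficient block.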
vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
vzeroupper
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,620 @@
;
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see jfdctint.c for
; more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%define CONST_BITS 13
%define PASS1_BITS 2
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
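; Pass 1 keeps PASS1_BITS extra fractional bits in the intermediate results
; (descale by CONST_BITS - PASS1_BITS); pass 2 removes both the CONST_BITS and
; the PASS1_BITS scaling (descale by CONST_BITS + PASS1_BITS).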
%if CONST_BITS == 13
F_0_298 equ 2446 ; FIX(0.298631336)
F_0_390 equ 3196 ; FIX(0.390180644)
F_0_541 equ 4433 ; FIX(0.541196100)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_175 equ 9633 ; FIX(1.175875602)
F_1_501 equ 12299 ; FIX(1.501321110)
F_1_847 equ 15137 ; FIX(1.847759065)
F_1_961 equ 16069 ; FIX(1.961570560)
F_2_053 equ 16819 ; FIX(2.053119869)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fdct_islow_mmx)
EXTN(jconst_fdct_islow_mmx):
PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_mmx(DCTELEM *data)
;
%define data(b) (b) + 8 ; DCTELEM *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
%define WK_NUM 2
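; wk[] is a small scratch area on the aligned stack, used to spill transpose
; quadrants and the tmp6/tmp7 and z3 values that do not fit in the eight MMX
; registers.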
align 32
GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
EXTN(jsimd_fdct_islow_mmx):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
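; esp is aligned to SIZEOF_MMWORD so the wk[] spill slots are naturally
; aligned; the pre-alignment esp (in eax) is stashed at [ebp] and restored by
; the 'pop esp' in the epilogue.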
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4
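; DCTSIZE/4 = 2 iterations: each .rowloop pass performs the 8-point row DCT on
; four rows in parallel, one row per MMX word lane.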
alignx 16, 7
.rowloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
; mm0=(20 21 22 23), mm2=(24 25 26 27)
; mm1=(30 31 32 33), mm3=(34 35 36 37)
movq mm4, mm0 ; transpose coefficients(phase 1)
punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
movq mm5, mm2 ; transpose coefficients(phase 1)
punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
; mm6=(00 01 02 03), mm1=(04 05 06 07)
; mm7=(10 11 12 13), mm3=(14 15 16 17)
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
movq mm4, mm6 ; transpose coefficients(phase 1)
punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
movq mm2, mm1 ; transpose coefficients(phase 1)
punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
movq mm7, mm6 ; transpose coefficients(phase 2)
punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
movq mm3, mm2 ; transpose coefficients(phase 2)
punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
movq mm0, mm7
movq mm5, mm6
psubw mm7, mm2 ; mm7=data1-data6=tmp6
psubw mm6, mm3 ; mm6=data0-data7=tmp7
paddw mm0, mm2 ; mm0=data1+data6=tmp1
paddw mm5, mm3 ; mm5=data0+data7=tmp0
movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
movq mm7, mm4 ; transpose coefficients(phase 2)
punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
movq mm6, mm1 ; transpose coefficients(phase 2)
punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
movq mm2, mm7
movq mm3, mm4
paddw mm7, mm1 ; mm7=data3+data4=tmp3
paddw mm4, mm6 ; mm4=data2+data5=tmp2
psubw mm2, mm1 ; mm2=data3-data4=tmp4
psubw mm3, mm6 ; mm3=data2-data5=tmp5
; -- Even part
movq mm1, mm5
movq mm6, mm0
paddw mm5, mm7 ; mm5=tmp10
paddw mm0, mm4 ; mm0=tmp11
psubw mm1, mm7 ; mm1=tmp13
psubw mm6, mm4 ; mm6=tmp12
movq mm7, mm5
paddw mm5, mm0 ; mm5=tmp10+tmp11
psubw mm7, mm0 ; mm7=tmp10-tmp11
psllw mm5, PASS1_BITS ; mm5=data0
psllw mm7, PASS1_BITS ; mm7=data4
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
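; data0 and data4 need no multiplication, only the PASS1_BITS up-shift, so
; they are stored back to the block immediately.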
; (Original)
; z1 = (tmp12 + tmp13) * 0.541196100;
; data2 = z1 + tmp13 * 0.765366865;
; data6 = z1 + tmp12 * -1.847759065;
;
; (This implementation)
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movq mm4, mm1 ; mm1=tmp13
movq mm0, mm1
punpcklwd mm4, mm6 ; mm6=tmp12
punpckhwd mm0, mm6
movq mm1, mm4
movq mm6, mm0
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm4, DESCALE_P1
psrad mm0, DESCALE_P1
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm1, DESCALE_P1
psrad mm6, DESCALE_P1
packssdw mm4, mm0 ; mm4=data2
packssdw mm1, mm6 ; mm1=data6
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
; -- Odd part
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
movq mm0, mm2 ; mm2=tmp4
movq mm6, mm3 ; mm3=tmp5
paddw mm0, mm5 ; mm0=z3
paddw mm6, mm7 ; mm6=z4
; (Original)
; z5 = (z3 + z4) * 1.175875602;
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
; z3 += z5; z4 += z5;
;
; (This implementation)
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movq mm4, mm0
movq mm1, mm0
punpcklwd mm4, mm6
punpckhwd mm1, mm6
movq mm0, mm4
movq mm6, mm1
pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
; (Original)
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
;
; (This implementation)
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4;
movq mm4, mm2
movq mm1, mm2
punpcklwd mm4, mm7
punpckhwd mm1, mm7
movq mm2, mm4
movq mm7, mm1
pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
paddd mm2, mm0 ; mm2=data1L
paddd mm7, mm6 ; mm7=data1H
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm4, DESCALE_P1
psrad mm1, DESCALE_P1
paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm2, DESCALE_P1
psrad mm7, DESCALE_P1
packssdw mm4, mm1 ; mm4=data7
packssdw mm2, mm7 ; mm2=data1
movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
movq mm1, mm3
movq mm7, mm3
punpcklwd mm1, mm5
punpckhwd mm7, mm5
movq mm3, mm1
movq mm5, mm7
pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
paddd mm1, mm0 ; mm1=data5L
paddd mm7, mm6 ; mm7=data5H
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm1, DESCALE_P1
psrad mm7, DESCALE_P1
paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad mm3, DESCALE_P1
psrad mm5, DESCALE_P1
packssdw mm1, mm7 ; mm1=data5
packssdw mm3, mm5 ; mm3=data3
movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
dec ecx
jnz near .rowloop
; ---- Pass 2: process columns.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4
alignx 16, 7
.columnloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; mm0=(02 12 22 32), mm2=(42 52 62 72)
; mm1=(03 13 23 33), mm3=(43 53 63 73)
movq mm4, mm0 ; transpose coefficients(phase 1)
punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
movq mm5, mm2 ; transpose coefficients(phase 1)
punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
; mm6=(00 10 20 30), mm1=(40 50 60 70)
; mm7=(01 11 21 31), mm3=(41 51 61 71)
movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
movq mm4, mm6 ; transpose coefficients(phase 1)
punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
movq mm2, mm1 ; transpose coefficients(phase 1)
punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
movq mm7, mm6 ; transpose coefficients(phase 2)
punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
movq mm3, mm2 ; transpose coefficients(phase 2)
punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
movq mm0, mm7
movq mm5, mm6
psubw mm7, mm2 ; mm7=data1-data6=tmp6
psubw mm6, mm3 ; mm6=data0-data7=tmp7
paddw mm0, mm2 ; mm0=data1+data6=tmp1
paddw mm5, mm3 ; mm5=data0+data7=tmp0
movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
movq mm7, mm4 ; transpose coefficients(phase 2)
punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
movq mm6, mm1 ; transpose coefficients(phase 2)
punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
movq mm2, mm7
movq mm3, mm4
paddw mm7, mm1 ; mm7=data3+data4=tmp3
paddw mm4, mm6 ; mm4=data2+data5=tmp2
psubw mm2, mm1 ; mm2=data3-data4=tmp4
psubw mm3, mm6 ; mm3=data2-data5=tmp5
; -- Even part
movq mm1, mm5
movq mm6, mm0
paddw mm5, mm7 ; mm5=tmp10
paddw mm0, mm4 ; mm0=tmp11
psubw mm1, mm7 ; mm1=tmp13
psubw mm6, mm4 ; mm6=tmp12
movq mm7, mm5
paddw mm5, mm0 ; mm5=tmp10+tmp11
psubw mm7, mm0 ; mm7=tmp10-tmp11
paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
psraw mm5, PASS1_BITS ; mm5=data0
psraw mm7, PASS1_BITS ; mm7=data4
movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
; (Original)
; z1 = (tmp12 + tmp13) * 0.541196100;
; data2 = z1 + tmp13 * 0.765366865;
; data6 = z1 + tmp12 * -1.847759065;
;
; (This implementation)
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movq mm4, mm1 ; mm1=tmp13
movq mm0, mm1
punpcklwd mm4, mm6 ; mm6=tmp12
punpckhwd mm0, mm6
movq mm1, mm4
movq mm6, mm0
pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm4, DESCALE_P2
psrad mm0, DESCALE_P2
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm1, DESCALE_P2
psrad mm6, DESCALE_P2
packssdw mm4, mm0 ; mm4=data2
packssdw mm1, mm6 ; mm1=data6
movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
; -- Odd part
movq mm5, MMWORD [wk(0)] ; mm5=tmp6
movq mm7, MMWORD [wk(1)] ; mm7=tmp7
movq mm0, mm2 ; mm2=tmp4
movq mm6, mm3 ; mm3=tmp5
paddw mm0, mm5 ; mm0=z3
paddw mm6, mm7 ; mm6=z4
; (Original)
; z5 = (z3 + z4) * 1.175875602;
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
; z3 += z5; z4 += z5;
;
; (This implementation)
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movq mm4, mm0
movq mm1, mm0
punpcklwd mm4, mm6
punpckhwd mm1, mm6
movq mm0, mm4
movq mm6, mm1
pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
; (Original)
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
;
; (This implementation)
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4;
movq mm4, mm2
movq mm1, mm2
punpcklwd mm4, mm7
punpckhwd mm1, mm7
movq mm2, mm4
movq mm7, mm1
pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
paddd mm4, MMWORD [wk(0)] ; mm4=data7L
paddd mm1, MMWORD [wk(1)] ; mm1=data7H
paddd mm2, mm0 ; mm2=data1L
paddd mm7, mm6 ; mm7=data1H
paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm4, DESCALE_P2
psrad mm1, DESCALE_P2
paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm2, DESCALE_P2
psrad mm7, DESCALE_P2
packssdw mm4, mm1 ; mm4=data7
packssdw mm2, mm7 ; mm2=data1
movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
movq mm1, mm3
movq mm7, mm3
punpcklwd mm1, mm5
punpckhwd mm7, mm5
movq mm3, mm1
movq mm5, mm7
pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
paddd mm1, mm0 ; mm1=data5L
paddd mm7, mm6 ; mm7=data5H
paddd mm3, MMWORD [wk(0)] ; mm3=data3L
paddd mm5, MMWORD [wk(1)] ; mm5=data3H
paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm1, DESCALE_P2
psrad mm7, DESCALE_P2
paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad mm3, DESCALE_P2
psrad mm5, DESCALE_P2
packssdw mm1, mm7 ; mm1=data5
packssdw mm3, mm5 ; mm3=data3
movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
add edx, byte 4*SIZEOF_DCTELEM
dec ecx
jnz near .columnloop
emms ; empty MMX state
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

@@ -0,0 +1,633 @@
;
; jfdctint.asm - accurate integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see jfdctint.c for
; more details.
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
%define CONST_BITS 13
%define PASS1_BITS 2
%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
%if CONST_BITS == 13
F_0_298 equ 2446 ; FIX(0.298631336)
F_0_390 equ 3196 ; FIX(0.390180644)
F_0_541 equ 4433 ; FIX(0.541196100)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_175 equ 9633 ; FIX(1.175875602)
F_1_501 equ 12299 ; FIX(1.501321110)
F_1_847 equ 15137 ; FIX(1.847759065)
F_1_961 equ 16069 ; FIX(1.961570560)
F_2_053 equ 16819 ; FIX(2.053119869)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_072 equ 25172 ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2):
PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_sse2(DCTELEM *data)
;
%define data(b) (b) + 8 ; DCTELEM *data
%define original_ebp ebp + 0
%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 6
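; wk(0)/wk(1) hold transient spills (transpose halves, tmp6/tmp7, z3L/z3H);
; wk(2)..wk(5) keep the pass 1 results for rows 0, 4, 2 and 6 until pass 2
; reads them back as columns.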
align 32
GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
EXTN(jsimd_fdct_islow_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)]
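; esp is aligned to 128 bits because the movdqa accesses to the wk[] spill
; slots require 16-byte-aligned addresses.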
pushpic ebx
; push ecx ; unused
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *)
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
movdqa xmm6, xmm1
movdqa xmm3, xmm0
psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
movdqa xmm2, xmm1
movdqa xmm5, xmm7
paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
; -- Even part
movdqa xmm4, xmm3
movdqa xmm0, xmm6
paddw xmm3, xmm1 ; xmm3=tmp10
paddw xmm6, xmm7 ; xmm6=tmp11
psubw xmm4, xmm1 ; xmm4=tmp13
psubw xmm0, xmm7 ; xmm0=tmp12
movdqa xmm1, xmm3
paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
psllw xmm3, PASS1_BITS ; xmm3=data0
psllw xmm1, PASS1_BITS ; xmm1=data4
movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
; (Original)
; z1 = (tmp12 + tmp13) * 0.541196100;
; data2 = z1 + tmp13 * 0.765366865;
; data6 = z1 + tmp12 * -1.847759065;
;
; (This implementation)
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm7, xmm4 ; xmm4=tmp13
movdqa xmm6, xmm4
punpcklwd xmm7, xmm0 ; xmm0=tmp12
punpckhwd xmm6, xmm0
movdqa xmm4, xmm7
movdqa xmm0, xmm6
pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm7, DESCALE_P1
psrad xmm6, DESCALE_P1
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm4, DESCALE_P1
psrad xmm0, DESCALE_P1
packssdw xmm7, xmm6 ; xmm7=data2
packssdw xmm4, xmm0 ; xmm4=data6
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
; -- Odd part
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
movdqa xmm6, xmm2 ; xmm2=tmp4
movdqa xmm0, xmm5 ; xmm5=tmp5
paddw xmm6, xmm3 ; xmm6=z3
paddw xmm0, xmm1 ; xmm0=z4
; (Original)
; z5 = (z3 + z4) * 1.175875602;
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
; z3 += z5; z4 += z5;
;
; (This implementation)
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm7, xmm6
movdqa xmm4, xmm6
punpcklwd xmm7, xmm0
punpckhwd xmm4, xmm0
movdqa xmm6, xmm7
movdqa xmm0, xmm4
pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
; (Original)
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
;
; (This implementation)
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm7, xmm2
movdqa xmm4, xmm2
punpcklwd xmm7, xmm1
punpckhwd xmm4, xmm1
movdqa xmm2, xmm7
movdqa xmm1, xmm4
pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
paddd xmm2, xmm6 ; xmm2=data1L
paddd xmm1, xmm0 ; xmm1=data1H
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm7, DESCALE_P1
psrad xmm4, DESCALE_P1
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm2, DESCALE_P1
psrad xmm1, DESCALE_P1
packssdw xmm7, xmm4 ; xmm7=data7
packssdw xmm2, xmm1 ; xmm2=data1
movdqa xmm4, xmm5
movdqa xmm1, xmm5
punpcklwd xmm4, xmm3
punpckhwd xmm1, xmm3
movdqa xmm5, xmm4
movdqa xmm3, xmm1
pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
paddd xmm4, xmm6 ; xmm4=data5L
paddd xmm1, xmm0 ; xmm1=data5H
paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm4, DESCALE_P1
psrad xmm1, DESCALE_P1
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
psrad xmm5, DESCALE_P1
psrad xmm3, DESCALE_P1
packssdw xmm4, xmm1 ; xmm4=data5
packssdw xmm5, xmm3 ; xmm5=data3
; ---- Pass 2: process columns.
; mov edx, POINTER [data(eax)] ; (DCTELEM *)
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
movdqa xmm2, xmm5
movdqa xmm7, xmm6
psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
movdqa xmm0, xmm5
movdqa xmm3, xmm4
paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
; -- Even part
movdqa xmm1, xmm7
movdqa xmm6, xmm2
paddw xmm7, xmm5 ; xmm7=tmp10
paddw xmm2, xmm4 ; xmm2=tmp11
psubw xmm1, xmm5 ; xmm1=tmp13
psubw xmm6, xmm4 ; xmm6=tmp12
movdqa xmm5, xmm7
paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
psraw xmm7, PASS1_BITS ; xmm7=data0
psraw xmm5, PASS1_BITS ; xmm5=data4
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
; (Original)
; z1 = (tmp12 + tmp13) * 0.541196100;
; data2 = z1 + tmp13 * 0.765366865;
; data6 = z1 + tmp12 * -1.847759065;
;
; (This implementation)
; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
movdqa xmm4, xmm1 ; xmm1=tmp13
movdqa xmm2, xmm1
punpcklwd xmm4, xmm6 ; xmm6=tmp12
punpckhwd xmm2, xmm6
movdqa xmm1, xmm4
movdqa xmm6, xmm2
pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm4, DESCALE_P2
psrad xmm2, DESCALE_P2
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm1, DESCALE_P2
psrad xmm6, DESCALE_P2
packssdw xmm4, xmm2 ; xmm4=data2
packssdw xmm1, xmm6 ; xmm1=data6
movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
; -- Odd part
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
movdqa xmm2, xmm0 ; xmm0=tmp4
movdqa xmm6, xmm3 ; xmm3=tmp5
paddw xmm2, xmm7 ; xmm2=z3
paddw xmm6, xmm5 ; xmm6=z4
; (Original)
; z5 = (z3 + z4) * 1.175875602;
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
; z3 += z5; z4 += z5;
;
; (This implementation)
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
movdqa xmm4, xmm2
movdqa xmm1, xmm2
punpcklwd xmm4, xmm6
punpckhwd xmm1, xmm6
movdqa xmm2, xmm4
movdqa xmm6, xmm1
pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
; (Original)
; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
;
; (This implementation)
; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
; data7 = tmp4 + z3; data5 = tmp5 + z4;
; data3 = tmp6 + z3; data1 = tmp7 + z4;
movdqa xmm4, xmm0
movdqa xmm1, xmm0
punpcklwd xmm4, xmm5
punpckhwd xmm1, xmm5
movdqa xmm0, xmm4
movdqa xmm5, xmm1
pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
paddd xmm0, xmm2 ; xmm0=data1L
paddd xmm5, xmm6 ; xmm5=data1H
paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm4, DESCALE_P2
psrad xmm1, DESCALE_P2
paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm0, DESCALE_P2
psrad xmm5, DESCALE_P2
packssdw xmm4, xmm1 ; xmm4=data7
packssdw xmm0, xmm5 ; xmm0=data1
movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
movdqa xmm1, xmm3
movdqa xmm5, xmm3
punpcklwd xmm1, xmm7
punpckhwd xmm5, xmm7
movdqa xmm3, xmm1
movdqa xmm7, xmm5
pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
paddd xmm1, xmm2 ; xmm1=data5L
paddd xmm5, xmm6 ; xmm5=data5H
paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm1, DESCALE_P2
psrad xmm5, DESCALE_P2
paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
psrad xmm3, DESCALE_P2
psrad xmm7, DESCALE_P2
packssdw xmm1, xmm5 ; xmm1=data5
packssdw xmm3, xmm7 ; xmm3=data3
movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; unused
poppic ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

Some files were not shown because too many files have changed in this diff.
