about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author DRC <dcommander@users.sourceforge.net> 2014-08-23 15:47:51 +0000
committer DRC <dcommander@users.sourceforge.net> 2014-08-23 15:47:51 +0000
commit d729f4da9c86b7212912a7d59e49d061d0e61d5f (patch)
tree 2f13a0d34b00c68f692b09bf71b6874d768d7f9e
parent f5644c3498b3ebf7c8472818dcf1cf9ef7943bb9 (diff)
download libjpeg-turbo-d729f4da9c86b7212912a7d59e49d061d0e61d5f.tar.gz
ARM NEON SIMD support for YCC-to-RGB565 conversion, and optimizations to the existing YCC-to-RGB conversion code:
-----
https://github.com/ssvb/libjpeg-turbo/commit/aee36252be20054afce371a92406fc66ba6627b5.patch
From aee36252be20054afce371a92406fc66ba6627b5 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Wed, 13 Aug 2014 03:50:22 +0300
Subject: [PATCH] ARM: Faster NEON yuv->rgb conversion for Krait and Cortex-A15

The older code was developed and tested only on ARM Cortex-A8 and ARM Cortex-A9. Tuning it for newer ARM processors can introduce some speed-up (up to 20%).

The performance of the inner loop (conversion of 8 pixels) improves from ~27 cycles down to ~22 cycles on Qualcomm Krait 300, and from ~20 cycles down to ~18 cycles on ARM Cortex-A15.

The performance remains exactly the same on ARM Cortex-A7 (~58 cycles), ARM Cortex-A8 (~25 cycles) and ARM Cortex-A9 (~30 cycles) processors.

Also use larger indentation in the source code for separating two independent instruction streams.

-----
https://github.com/ssvb/libjpeg-turbo/commit/a5efdbf22ce9c1acd4b14a353cec863c2c57557e.patch
From a5efdbf22ce9c1acd4b14a353cec863c2c57557e Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Wed, 13 Aug 2014 07:23:09 +0300
Subject: [PATCH] ARM: NEON optimized yuv->rgb565 conversion

The performance of the inner loop (conversion of 8 pixels):
* ARM Cortex-A7: ~55 cycles
* ARM Cortex-A8: ~28 cycles
* ARM Cortex-A9: ~32 cycles
* ARM Cortex-A15: ~20 cycles
* Qualcomm Krait: ~24 cycles

Based on the Linaro rgb565 patch from https://sourceforge.net/p/libjpeg-turbo/patches/24/ but implements better instruction scheduling.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1385 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r-- ChangeLog.txt 3
-rw-r--r-- jdcolor.c 10
-rw-r--r-- jsimd.h 6
-rw-r--r-- jsimd_none.c 15
-rw-r--r-- simd/jsimd.h 4
-rw-r--r-- simd/jsimd_arm.c 31
-rw-r--r-- simd/jsimd_arm64.c 13
-rw-r--r-- simd/jsimd_arm_neon.S 70
-rw-r--r-- simd/jsimd_i386.c 15
-rw-r--r-- simd/jsimd_mips.c 15
-rw-r--r-- simd/jsimd_x86_64.c 15
11 files changed, 172 insertions, 25 deletions
diff --git a/ChangeLog.txt b/ChangeLog.txt
index c31a0075..166788b2 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -68,7 +68,8 @@ accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit
slower.
[8] Added a new output colorspace (JCS_RGB565) to the libjpeg API that allows
-for decompressing JPEG images into RGB565 (16-bit) pixels.
+for decompressing JPEG images into RGB565 (16-bit) pixels. If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
[9] Numerous obsolete features, such as support for non-ANSI compilers and
support for the MS-DOS memory model, were removed from the libjpeg code,
diff --git a/jdcolor.c b/jdcolor.c
index 6927e5ed..ffedabd5 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -6,7 +6,7 @@
* Modified 2011 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011-2012, D. R. Commander.
+ * Copyright (C) 2009, 2011-2012, 2014, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -657,8 +657,12 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
cinfo->out_color_components = 3;
if (cinfo->dither_mode == JDITHER_NONE) {
if (cinfo->jpeg_color_space == JCS_YCbCr) {
- cconvert->pub.color_convert = ycc_rgb565_convert;
- build_ycc_rgb_table(cinfo);
+ if (jsimd_can_ycc_rgb565())
+ cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb565_convert;
+ build_ycc_rgb_table(cinfo);
+ }
} else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
cconvert->pub.color_convert = gray_rgb565_convert;
} else if (cinfo->jpeg_color_space == JCS_RGB) {
diff --git a/jsimd.h b/jsimd.h
index d45fd700..f1f584b8 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -2,7 +2,7 @@
* jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2011 D. R. Commander
+ * Copyright 2011, 2014 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,6 +13,7 @@
EXTERN(int) jsimd_can_rgb_ycc (void);
EXTERN(int) jsimd_can_rgb_gray (void);
EXTERN(int) jsimd_can_ycc_rgb (void);
+EXTERN(int) jsimd_can_ycc_rgb565 (void);
EXTERN(int) jsimd_c_can_null_convert (void);
EXTERN(void) jsimd_rgb_ycc_convert
@@ -24,6 +25,9 @@ EXTERN(void) jsimd_rgb_gray_convert
EXTERN(void) jsimd_ycc_rgb_convert
(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert
+ (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_c_null_convert
(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
diff --git a/jsimd_none.c b/jsimd_none.c
index 96a9842a..34aefc9f 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -2,7 +2,7 @@
* jsimd_none.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011 D. R. Commander
+ * Copyright 2009-2011, 2014 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -37,6 +37,12 @@ jsimd_can_ycc_rgb (void)
}
GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
jsimd_c_can_null_convert (void)
{
return 0;
@@ -64,6 +70,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
}
GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(void)
jsimd_c_null_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 609b91f5..c5abd458 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -4,6 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2011 D. R. Commander
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California
+ * Copyright (C) 2014 Linaro Limited
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -249,6 +250,9 @@ EXTERN(void) jsimd_ycc_extxbgr_convert_neon
EXTERN(void) jsimd_ycc_extxrgb_convert_neon
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert_neon
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
index aefb1e67..4cbcf2d5 100644
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -2,7 +2,7 @@
* jsimd_arm.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013 D. R. Commander
+ * Copyright 2009-2011, 2013-2014 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -175,6 +175,23 @@ jsimd_can_ycc_rgb (void)
return 0;
}
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ARM_NEON)
+ return 1;
+
+ return 0;
+}
+
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -251,7 +268,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
case JCS_EXT_ARGB:
neonfct=jsimd_ycc_extxrgb_convert_neon;
break;
- default:
+ default:
neonfct=jsimd_ycc_extrgb_convert_neon;
break;
}
@@ -260,6 +277,16 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+ if (simd_support & JSIMD_ARM_NEON)
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
index 44225aa6..a346d165 100644
--- a/simd/jsimd_arm64.c
+++ b/simd/jsimd_arm64.c
@@ -95,6 +95,12 @@ jsimd_can_ycc_rgb (void)
return 0;
}
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -148,6 +154,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S
index 44c61fdd..7e8e134c 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014 Linaro Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -1346,6 +1347,19 @@ asm_function jsimd_idct_2x2_neon
.else
.error unsupported macroblock size
.endif
+ .elseif \bpp == 16
+ .if \size == 8
+ vst1.16 {q15}, [RGB]!
+ .elseif \size == 4
+ vst1.16 {d30}, [RGB]!
+ .elseif \size == 2
+ vst1.16 {d31[0]}, [RGB]!
+ vst1.16 {d31[1]}, [RGB]!
+ .elseif \size == 1
+ vst1.16 {d31[2]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
.else
.error unsupported bpp
.endif
@@ -1377,44 +1391,71 @@ asm_function jsimd_idct_2x2_neon
vrshrn.s32 d25, q13, #14
vrshrn.s32 d28, q14, #14
vrshrn.s32 d29, q15, #14
- vaddw.u8 q10, q10, d0
+ vaddw.u8 q11, q10, d0
vaddw.u8 q12, q12, d0
vaddw.u8 q14, q14, d0
- vqmovun.s16 d1\g_offs, q10
+.if \bpp != 16
+ vqmovun.s16 d1\g_offs, q11
vqmovun.s16 d1\r_offs, q12
vqmovun.s16 d1\b_offs, q14
+.else /* rgb565 */
+ vqshlu.s16 q13, q11, #8
+ vqshlu.s16 q15, q12, #8
+ vqshlu.s16 q14, q14, #8
+ vsri.u16 q15, q13, #5
+ vsri.u16 q15, q14, #11
+.endif
.endm
.macro do_yuv_to_rgb_stage2_store_load_stage1
- vld1.8 {d4}, [U, :64]!
+ /* "do_yuv_to_rgb_stage2" and "store" */
vrshrn.s32 d20, q10, #15
+ /* "load" and "do_yuv_to_rgb_stage1" */
+ pld [U, #64]
vrshrn.s32 d21, q11, #15
+ pld [V, #64]
vrshrn.s32 d24, q12, #14
vrshrn.s32 d25, q13, #14
+ vld1.8 {d4}, [U, :64]!
vrshrn.s32 d28, q14, #14
vld1.8 {d5}, [V, :64]!
vrshrn.s32 d29, q15, #14
- vaddw.u8 q10, q10, d0
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- vqmovun.s16 d1\g_offs, q10
- vld1.8 {d0}, [Y, :64]!
- vqmovun.s16 d1\r_offs, q12
- pld [U, #64]
- pld [V, #64]
- pld [Y, #64]
- vqmovun.s16 d1\b_offs, q14
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
- do_store \bpp, 8
+ vaddw.u8 q11, q10, d0
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
+ vaddw.u8 q12, q12, d0
+ vaddw.u8 q14, q14, d0
+.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
+ vqmovun.s16 d1\g_offs, q11
+ pld [Y, #64]
+ vqmovun.s16 d1\r_offs, q12
+ vld1.8 {d0}, [Y, :64]!
+ vqmovun.s16 d1\b_offs, q14
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
+ do_store \bpp, 8
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
+.else /**************************** rgb565 ***********************************/
+ vqshlu.s16 q13, q11, #8
+ pld [Y, #64]
+ vqshlu.s16 q15, q12, #8
+ vqshlu.s16 q14, q14, #8
+ vld1.8 {d0}, [Y, :64]!
+ vmull.s16 q11, d7, d1[1]
+ vmlal.s16 q11, d9, d1[2]
+ vsri.u16 q15, q13, #5
+ vmull.s16 q12, d8, d1[0]
+ vsri.u16 q15, q14, #11
+ vmull.s16 q13, d9, d1[0]
+ vmull.s16 q14, d6, d1[3]
+ do_store \bpp, 8
+ vmull.s16 q15, d7, d1[3]
+.endif
.endm
.macro do_yuv_to_rgb
@@ -1556,6 +1597,7 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
+generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
.purgem do_load
.purgem do_store
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
index c173e740..25d06974 100644
--- a/simd/jsimd_i386.c
+++ b/simd/jsimd_i386.c
@@ -2,7 +2,7 @@
* jsimd_i386.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011, 2013 D. R. Commander
+ * Copyright 2009-2011, 2013-2014 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -130,6 +130,12 @@ jsimd_can_ycc_rgb (void)
return 0;
}
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -280,6 +286,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
index e95023a2..abcd19f5 100644
--- a/simd/jsimd_mips.c
+++ b/simd/jsimd_mips.c
@@ -2,7 +2,7 @@
* jsimd_mips.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011 D. R. Commander
+ * Copyright 2009-2011, 2014 D. R. Commander
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California
*
* Based on the x86 SIMD extension for IJG JPEG library,
@@ -162,6 +162,12 @@ jsimd_can_ycc_rgb (void)
}
GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0;
+}
+
+GLOBAL(int)
jsimd_c_can_null_convert (void)
{
init_simd();
@@ -300,6 +306,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
}
GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(void)
jsimd_c_null_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows)
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
index 87c9d56f..2c47a7f3 100644
--- a/simd/jsimd_x86_64.c
+++ b/simd/jsimd_x86_64.c
@@ -2,7 +2,7 @@
* jsimd_x86_64.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright 2009-2011 D. R. Commander
+ * Copyright 2009-2011, 2014 D. R. Commander
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -80,6 +80,12 @@ jsimd_can_ycc_rgb (void)
return 1;
}
+GLOBAL(int)
+jsimd_can_ycc_rgb565 (void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -194,6 +200,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
}
+GLOBAL(void)
+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_downsample (void)
{