diff options
author | DRC <dcommander@users.sourceforge.net> | 2014-08-23 15:47:51 +0000 |
---|---|---|
committer | DRC <dcommander@users.sourceforge.net> | 2014-08-23 15:47:51 +0000 |
commit | d729f4da9c86b7212912a7d59e49d061d0e61d5f (patch) | |
tree | 2f13a0d34b00c68f692b09bf71b6874d768d7f9e | |
parent | f5644c3498b3ebf7c8472818dcf1cf9ef7943bb9 (diff) | |
download | libjpeg-turbo-d729f4da9c86b7212912a7d59e49d061d0e61d5f.tar.gz |
ARM NEON SIMD support for YCC-to-RGB565 conversion, and optimizations to the existing YCC-to-RGB conversion code:
-----
https://github.com/ssvb/libjpeg-turbo/commit/aee36252be20054afce371a92406fc66ba6627b5.patch
From aee36252be20054afce371a92406fc66ba6627b5 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Wed, 13 Aug 2014 03:50:22 +0300
Subject: [PATCH] ARM: Faster NEON yuv->rgb conversion for Krait and Cortex-A15
The older code was developed and tested only on ARM Cortex-A8 and ARM Cortex-A9.
Tuning it for newer ARM processors can introduce some speed-up (up to 20%).
The performance of the inner loop (conversion of 8 pixels) improves from
~27 cycles down to ~22 cycles on Qualcomm Krait 300, and from ~20 cycles
down to ~18 cycles on ARM Cortex-A15.
The performance remains exactly the same on ARM Cortex-A7 (~58 cycles),
ARM Cortex-A8 (~25 cycles) and ARM Cortex-A9 (~30 cycles) processors.
Also use larger indentation in the source code for separating two independent
instruction streams.
-----
https://github.com/ssvb/libjpeg-turbo/commit/a5efdbf22ce9c1acd4b14a353cec863c2c57557e.patch
From a5efdbf22ce9c1acd4b14a353cec863c2c57557e Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Wed, 13 Aug 2014 07:23:09 +0300
Subject: [PATCH] ARM: NEON optimized yuv->rgb565 conversion
The performance of the inner loop (conversion of 8 pixels):
* ARM Cortex-A7: ~55 cycles
* ARM Cortex-A8: ~28 cycles
* ARM Cortex-A9: ~32 cycles
* ARM Cortex-A15: ~20 cycles
* Qualcomm Krait: ~24 cycles
Based on the Linaro rgb565 patch from
https://sourceforge.net/p/libjpeg-turbo/patches/24/
but implements better instruction scheduling.
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1385 632fc199-4ca6-4c93-a231-07263d6284db
-rw-r--r-- | ChangeLog.txt | 3 | ||||
-rw-r--r-- | jdcolor.c | 10 | ||||
-rw-r--r-- | jsimd.h | 6 | ||||
-rw-r--r-- | jsimd_none.c | 15 | ||||
-rw-r--r-- | simd/jsimd.h | 4 | ||||
-rw-r--r-- | simd/jsimd_arm.c | 31 | ||||
-rw-r--r-- | simd/jsimd_arm64.c | 13 | ||||
-rw-r--r-- | simd/jsimd_arm_neon.S | 70 | ||||
-rw-r--r-- | simd/jsimd_i386.c | 15 | ||||
-rw-r--r-- | simd/jsimd_mips.c | 15 | ||||
-rw-r--r-- | simd/jsimd_x86_64.c | 15 |
11 files changed, 172 insertions, 25 deletions
diff --git a/ChangeLog.txt b/ChangeLog.txt index c31a0075..166788b2 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -68,7 +68,8 @@ accuracy than the slow integer DCT/IDCT algorithms, and they are quite a bit slower. [8] Added a new output colorspace (JCS_RGB565) to the libjpeg API that allows -for decompressing JPEG images into RGB565 (16-bit) pixels. +for decompressing JPEG images into RGB565 (16-bit) pixels. If dithering is not +used, then this code path is SIMD-accelerated on ARM platforms. [9] Numerous obsolete features, such as support for non-ANSI compilers and support for the MS-DOS memory model, were removed from the libjpeg code, @@ -6,7 +6,7 @@ * Modified 2011 by Guido Vollbeding. * libjpeg-turbo Modifications: * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009, 2011-2012, D. R. Commander. + * Copyright (C) 2009, 2011-2012, 2014, D. R. Commander. * Copyright (C) 2013, Linaro Limited. * For conditions of distribution and use, see the accompanying README file. * @@ -657,8 +657,12 @@ jinit_color_deconverter (j_decompress_ptr cinfo) cinfo->out_color_components = 3; if (cinfo->dither_mode == JDITHER_NONE) { if (cinfo->jpeg_color_space == JCS_YCbCr) { - cconvert->pub.color_convert = ycc_rgb565_convert; - build_ycc_rgb_table(cinfo); + if (jsimd_can_ycc_rgb565()) + cconvert->pub.color_convert = jsimd_ycc_rgb565_convert; + else { + cconvert->pub.color_convert = ycc_rgb565_convert; + build_ycc_rgb_table(cinfo); + } } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) { cconvert->pub.color_convert = gray_rgb565_convert; } else if (cinfo->jpeg_color_space == JCS_RGB) { @@ -2,7 +2,7 @@ * jsimd.h * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2011 D. R. Commander + * Copyright 2011, 2014 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -13,6 +13,7 @@ EXTERN(int) jsimd_can_rgb_ycc (void); EXTERN(int) jsimd_can_rgb_gray (void); EXTERN(int) jsimd_can_ycc_rgb (void); +EXTERN(int) jsimd_can_ycc_rgb565 (void); EXTERN(int) jsimd_c_can_null_convert (void); EXTERN(void) jsimd_rgb_ycc_convert @@ -24,6 +25,9 @@ EXTERN(void) jsimd_rgb_gray_convert EXTERN(void) jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert + (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); EXTERN(void) jsimd_c_null_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows); diff --git a/jsimd_none.c b/jsimd_none.c index 96a9842a..34aefc9f 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -2,7 +2,7 @@ * jsimd_none.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009-2011 D. R. Commander + * Copyright 2009-2011, 2014 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -37,6 +37,12 @@ jsimd_can_ycc_rgb (void) } GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(int) jsimd_c_can_null_convert (void) { return 0; @@ -64,6 +70,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, } GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) jsimd_c_null_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) diff --git a/simd/jsimd.h b/simd/jsimd.h index 609b91f5..c5abd458 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -4,6 +4,7 @@ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright 2011 D. R. 
Commander * Copyright (C) 2013-2014, MIPS Technologies, Inc., California + * Copyright (C) 2014 Linaro Limited * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -249,6 +250,9 @@ EXTERN(void) jsimd_ycc_extxbgr_convert_neon EXTERN(void) jsimd_ycc_extxrgb_convert_neon (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_rgb565_convert_neon + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2 (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c index aefb1e67..4cbcf2d5 100644 --- a/simd/jsimd_arm.c +++ b/simd/jsimd_arm.c @@ -2,7 +2,7 @@ * jsimd_arm.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009-2011, 2013 D. R. Commander + * Copyright 2009-2011, 2013-2014 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -175,6 +175,23 @@ jsimd_can_ycc_rgb (void) return 0; } +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_ARM_NEON) + return 1; + + return 0; +} + GLOBAL(void) jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -251,7 +268,7 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, case JCS_EXT_ARGB: neonfct=jsimd_ycc_extxrgb_convert_neon; break; - default: + default: neonfct=jsimd_ycc_extrgb_convert_neon; break; } @@ -260,6 +277,16 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ + if (simd_support & JSIMD_ARM_NEON) + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, + output_buf, num_rows); +} + GLOBAL(int) jsimd_can_h2v2_downsample (void) { diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c index 44225aa6..a346d165 100644 --- a/simd/jsimd_arm64.c +++ b/simd/jsimd_arm64.c @@ -95,6 +95,12 @@ jsimd_can_ycc_rgb (void) return 0; } +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + GLOBAL(void) jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -148,6 +154,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + GLOBAL(int) jsimd_can_h2v2_downsample (void) { diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S index 44c61fdd..7e8e134c 100644 --- a/simd/jsimd_arm_neon.S +++ b/simd/jsimd_arm_neon.S @@ -4,6 +4,7 @@ * Copyright 
(C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). * All rights reserved. * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2014 Linaro Limited. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -1346,6 +1347,19 @@ asm_function jsimd_idct_2x2_neon .else .error unsupported macroblock size .endif + .elseif \bpp == 16 + .if \size == 8 + vst1.16 {q15}, [RGB]! + .elseif \size == 4 + vst1.16 {d30}, [RGB]! + .elseif \size == 2 + vst1.16 {d31[0]}, [RGB]! + vst1.16 {d31[1]}, [RGB]! + .elseif \size == 1 + vst1.16 {d31[2]}, [RGB]! + .else + .error unsupported macroblock size + .endif .else .error unsupported bpp .endif @@ -1377,44 +1391,71 @@ asm_function jsimd_idct_2x2_neon vrshrn.s32 d25, q13, #14 vrshrn.s32 d28, q14, #14 vrshrn.s32 d29, q15, #14 - vaddw.u8 q10, q10, d0 + vaddw.u8 q11, q10, d0 vaddw.u8 q12, q12, d0 vaddw.u8 q14, q14, d0 - vqmovun.s16 d1\g_offs, q10 +.if \bpp != 16 + vqmovun.s16 d1\g_offs, q11 vqmovun.s16 d1\r_offs, q12 vqmovun.s16 d1\b_offs, q14 +.else /* rgb565 */ + vqshlu.s16 q13, q11, #8 + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vsri.u16 q15, q13, #5 + vsri.u16 q15, q14, #11 +.endif .endm .macro do_yuv_to_rgb_stage2_store_load_stage1 - vld1.8 {d4}, [U, :64]! + /* "do_yuv_to_rgb_stage2" and "store" */ vrshrn.s32 d20, q10, #15 + /* "load" and "do_yuv_to_rgb_stage1" */ + pld [U, #64] vrshrn.s32 d21, q11, #15 + pld [V, #64] vrshrn.s32 d24, q12, #14 vrshrn.s32 d25, q13, #14 + vld1.8 {d4}, [U, :64]! vrshrn.s32 d28, q14, #14 vld1.8 {d5}, [V, :64]! vrshrn.s32 d29, q15, #14 - vaddw.u8 q10, q10, d0 - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - vqmovun.s16 d1\g_offs, q10 - vld1.8 {d0}, [Y, :64]! 
- vqmovun.s16 d1\r_offs, q12 - pld [U, #64] - pld [V, #64] - pld [Y, #64] - vqmovun.s16 d1\b_offs, q14 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - do_store \bpp, 8 + vaddw.u8 q11, q10, d0 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ + vaddw.u8 q12, q12, d0 + vaddw.u8 q14, q14, d0 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/ + vqmovun.s16 d1\g_offs, q11 + pld [Y, #64] + vqmovun.s16 d1\r_offs, q12 + vld1.8 {d0}, [Y, :64]! + vqmovun.s16 d1\b_offs, q14 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ + do_store \bpp, 8 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ +.else /**************************** rgb565 ***********************************/ + vqshlu.s16 q13, q11, #8 + pld [Y, #64] + vqshlu.s16 q15, q12, #8 + vqshlu.s16 q14, q14, #8 + vld1.8 {d0}, [Y, :64]! + vmull.s16 q11, d7, d1[1] + vmlal.s16 q11, d9, d1[2] + vsri.u16 q15, q13, #5 + vmull.s16 q12, d8, d1[0] + vsri.u16 q15, q14, #11 + vmull.s16 q13, d9, d1[0] + vmull.s16 q14, d6, d1[3] + do_store \bpp, 8 + vmull.s16 q15, d7, d1[3] +.endif .endm .macro do_yuv_to_rgb @@ -1556,6 +1597,7 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 .purgem do_load .purgem do_store diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c index c173e740..25d06974 100644 --- a/simd/jsimd_i386.c +++ b/simd/jsimd_i386.c @@ -2,7 +2,7 @@ * jsimd_i386.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009-2011, 2013 D. R. 
Commander + * Copyright 2009-2011, 2013-2014 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -130,6 +130,12 @@ jsimd_can_ycc_rgb (void) return 0; } +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + GLOBAL(void) jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -280,6 +286,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + GLOBAL(int) jsimd_can_h2v2_downsample (void) { diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c index e95023a2..abcd19f5 100644 --- a/simd/jsimd_mips.c +++ b/simd/jsimd_mips.c @@ -2,7 +2,7 @@ * jsimd_mips.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009-2011 D. R. Commander + * Copyright 2009-2011, 2014 D. R. Commander * Copyright (C) 2013-2014, MIPS Technologies, Inc., California * * Based on the x86 SIMD extension for IJG JPEG library, @@ -162,6 +162,12 @@ jsimd_can_ycc_rgb (void) } GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + +GLOBAL(int) jsimd_c_can_null_convert (void) { init_simd(); @@ -300,6 +306,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, } GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + +GLOBAL(void) jsimd_c_null_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c index 87c9d56f..2c47a7f3 100644 --- a/simd/jsimd_x86_64.c +++ b/simd/jsimd_x86_64.c @@ -2,7 +2,7 @@ * jsimd_x86_64.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright 2009-2011 D. R. 
Commander + * Copyright 2009-2011, 2014 D. R. Commander * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -80,6 +80,12 @@ jsimd_can_ycc_rgb (void) return 1; } +GLOBAL(int) +jsimd_can_ycc_rgb565 (void) +{ + return 0; +} + GLOBAL(void) jsimd_rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -194,6 +200,13 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } +GLOBAL(void) +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +} + GLOBAL(int) jsimd_can_h2v2_downsample (void) { |