author     Hendrik Dahlkamp <hendrik@google.com>     2013-01-23 18:27:37 -0800
committer  Adam Hampson <ahampson@google.com>        2013-01-28 15:39:41 -0800
commit     33cfdeb7b267ab635413797fffb046b73272f7ec (patch)
tree       8ff16b765a83ba911233a1d7bfa27cce9cee3b7c /files
parent     a88a10a6ed9f9801852929bac34bdf10510116f4 (diff)
download   libyuv-33cfdeb7b267ab635413797fffb046b73272f7ec.tar.gz
Update libyuv to r397
Change-Id: I70f5a527de52ae8ae80b189873c9a094035dfa2c
Signed-off-by: Hendrik Dahlkamp <hendrik@google.com>
Diffstat (limited to 'files')
62 files changed, 27319 insertions, 8813 deletions
diff --git a/files/AUTHORS b/files/AUTHORS new file mode 100644 index 00000000..9686ac13 --- /dev/null +++ b/files/AUTHORS @@ -0,0 +1,4 @@ +# Names should be added to this file like so: +# Name or Organization <email address> + +Google Inc. diff --git a/files/codereview.settings b/files/codereview.settings new file mode 100644 index 00000000..11270bba --- /dev/null +++ b/files/codereview.settings @@ -0,0 +1,12 @@ +# This file is used by gcl to get repository specific information. +# The LibYuv code review is via WebRtc's code review +CODE_REVIEW_SERVER: webrtc-codereview.appspot.com +#CC_LIST: +#VIEW_VC: +#STATUS: +TRY_ON_UPLOAD: False +TRYSERVER_HTTP_HOST: webrtc-cb-linux-master.cbf.corp.google.com +TRYSERVER_HTTP_PORT: 9020 +#TRYSERVER_SVN_URL: +#GITCL_PREUPLOAD: +#GITCL_PREDCOMMIT: diff --git a/files/include/libyuv.h b/files/include/libyuv.h index 5a30e2d0..06f26aae 100644 --- a/files/include/libyuv.h +++ b/files/include/libyuv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,17 +8,22 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#ifndef LIBYUV_INCLUDE_LIBYUV_H_ -#define LIBYUV_INCLUDE_LIBYUV_H_ +#ifndef INCLUDE_LIBYUV_H_ // NOLINT +#define INCLUDE_LIBYUV_H_ #include "libyuv/basic_types.h" +#include "libyuv/compare.h" #include "libyuv/convert.h" +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" #include "libyuv/cpu_id.h" #include "libyuv/format_conversion.h" -#include "libyuv/general.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/rotate_argb.h" #include "libyuv/scale.h" +#include "libyuv/scale_argb.h" +#include "libyuv/version.h" +#include "libyuv/video_common.h" -#endif // LIBYUV_INCLUDE_LIBYUV_H_ +#endif // INCLUDE_LIBYUV_H_ NOLINT diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h index 5adc2bfd..9e9f2abc 100644 --- a/files/include/libyuv/basic_types.h +++ b/files/include/libyuv/basic_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,27 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT #define INCLUDE_LIBYUV_BASIC_TYPES_H_ #include <stddef.h> // for NULL, size_t -#ifndef WIN32 +#if !(defined(_MSC_VER) && (_MSC_VER < 1600)) #include <stdint.h> // for uintptr_t #endif #ifndef INT_TYPES_DEFINED #define INT_TYPES_DEFINED #ifdef COMPILER_MSVC -typedef __int64 int64; -#else -typedef long long int64; -#endif /* COMPILER_MSVC */ -typedef int int32; -typedef short int16; -typedef char int8; - -#ifdef COMPILER_MSVC typedef unsigned __int64 uint64; typedef __int64 int64; #ifndef INT64_C @@ -38,9 +29,20 @@ typedef __int64 int64; #define UINT64_C(x) x ## UI64 #endif #define INT64_F "I64" -#else -typedef unsigned long long uint64; -typedef long long int64; +#else // COMPILER_MSVC +#ifdef __LP64__ +typedef unsigned long uint64; // NOLINT +typedef long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // __LP64__ +typedef unsigned long long uint64; // NOLINT +typedef long long int64; // NOLINT #ifndef INT64_C #define INT64_C(x) x ## LL #endif @@ -48,10 +50,14 @@ typedef long long int64; #define UINT64_C(x) x ## ULL #endif #define INT64_F "ll" -#endif /* COMPILER_MSVC */ +#endif // __LP64__ +#endif // COMPILER_MSVC typedef unsigned int uint32; -typedef unsigned short uint16; +typedef int int32; +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT typedef unsigned char uint8; +typedef signed char int8; #endif // INT_TYPES_DEFINED // Detect compiler is for x86 or x64. @@ -59,10 +65,33 @@ typedef unsigned char uint8; defined(__i386__) || defined(_M_IX86) #define CPU_X86 1 #endif +// Detect compiler is for ARM. +#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif -#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1))) +#ifndef ALIGNP #define ALIGNP(p, t) \ - (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ - ((t)-1)) & ~((t)-1)))) + (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ + ((t) - 1)) & ~((t) - 1)))) +#endif + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__ ((visibility ("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h new file mode 100644 index 00000000..5fd924b8 --- /dev/null +++ b/files/include/libyuv/compare.h @@ -0,0 +1,73 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Sum Square Error - used to compute Mean Square Error or PSNR. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h index fa3b6446..1d4b6a5b 100644 --- a/files/include/libyuv/convert.h +++ b/files/include/libyuv/convert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,90 +8,243 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif + +// Alias. +#define I420ToI420 I420Copy + +// Copy I420 to I420. +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. 
+LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -int I420ToYUY2(const uint8* src_y, int src_stride_y, +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, int width, int height); -int I420ToUYVY(const uint8* src_y, int src_stride_y, +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, int width, int height); -// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +// Convert NV12 to I420. Also used for NV21. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -int ABGRToI420(const uint8* src_frame, int src_stride_frame, +// Convert Q420 to I420. 
+LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -int BGRAToI420(const uint8* src_frame, int src_stride_frame, +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert V210 to I420. +LIBYUV_API +int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API int ARGBToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_frame, int dst_stride_frame, +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. +LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, int width, int height); -} // namespace libyuv +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); -#endif // INCLUDE_LIBYUV_CONVERT_H_ +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. 
+LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "dst_width" / "dst_height" is size of destination to crop to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int dst_width, int dst_height, + RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h new file mode 100644 index 00000000..86085252 --- /dev/null +++ b/files/include/libyuv/convert_argb.h @@ -0,0 +1,228 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// Add missing V210 and Q420. +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. 
+LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// TODO(fbarchard): Convert Q420 to ARGB. +// LIBYUV_API +// int Q420ToARGB(const uint8* src_y, int src_stride_y, +// const uint8* src_yuy2, int src_stride_yuy2, +// uint8* dst_argb, int dst_stride_argb, +// int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// TODO(fbarchard): Convert V210 to ARGB. +// LIBYUV_API +// int V210ToARGB(const uint8* src_uyvy, int src_stride_uyvy, +// uint8* dst_argb, int dst_stride_argb, +// int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. 
+LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "dst_width" / "dst_height" is size of destination to crop to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int dst_width, int dst_height, + RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h new file mode 100644 index 00000000..4eae950c --- /dev/null +++ b/files/include/libyuv/convert_from.h @@ -0,0 +1,165 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToNV12 +// TODO(fbarchard): I420ToM420 +// TODO(fbarchard): I420ToQ420 + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToV210(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* 
dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h. + +// Convert I420 to specified format. +// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h index c1000e86..0914f1d2 100644 --- a/files/include/libyuv/cpu_id.h +++ b/files/include/libyuv/cpu_id.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,28 +8,63 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT #define INCLUDE_LIBYUV_CPU_ID_H_ +#include "libyuv/basic_types.h" + +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -// These flags are only valid on x86 processors -static const int kCpuHasSSE2 = 1; -static const int kCpuHasSSSE3 = 2; +// Internal flag to indicate cpuid is initialized. +static const int kCpuInitialized = 0x1; -// These flags are only valid on ARM processors -static const int kCpuHasNEON = 4; +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. -// Internal flag to indicate cpuid is initialized. -static const int kCpuInitialized = 8; +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; + +// Internal function used to auto-init. +LIBYUV_API +int InitCpuFlags(void); + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); // Detect CPU has SSE2 etc. -bool TestCpuFlag(int flag); +// Test_flag parameter should be one of kCpuHas constants above. +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; + return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag; +} // For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(0) to disable all cpu specific optimizations. 
+LIBYUV_API void MaskCpuFlags(int enable_flags); +// Low level cpuid for X86. Returns zeros on other CPUs. +LIBYUV_API +void CpuId(int cpu_info[4], int info_type); + +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif -#endif // INCLUDE_LIBYUV_CPU_ID_H_ +#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT diff --git a/files/include/libyuv/format_conversion.h b/files/include/libyuv/format_conversion.h index d3d36f38..06bd387f 100644 --- a/files/include/libyuv/format_conversion.h +++ b/files/include/libyuv/format_conversion.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,34 +8,161 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT #define INCLUDE_LIBYUV_FORMATCONVERSION_H_ #include "libyuv/basic_types.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif + +// Convert Bayer RGB formats to I420. +LIBYUV_API +int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ + BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) + +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer); + +// Convert I420 to Bayer RGB formats. +LIBYUV_API +int I420ToBayerBGGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGBRG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGRBG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Temporary API mapper. 
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ + I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) + +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height, + uint32 dst_fourcc_bayer); + +// Convert Bayer RGB formats to ARGB. +LIBYUV_API +int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) + +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer); + +// Converts ARGB to Bayer RGB formats. +LIBYUV_API +int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +// Temporary API mapper. +#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) -// Converts any Bayer RGB format to I420. -int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, - uint32 src_fourcc_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Converts any Bayer RGB format to ARGB. -int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, - uint32 src_fourcc_bayer, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height); - -// Converts ARGB to any Bayer RGB format. -int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, - uint8* dst_bayer, int dst_stride_bayer, - uint32 dst_fourcc_bayer, - int width, int height); +LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer); +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif -#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT diff --git a/files/include/libyuv/general.h b/files/include/libyuv/general.h deleted file mode 100644 index 58943c86..00000000 --- a/files/include/libyuv/general.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/* - * General operations on YUV images. - */ - -#ifndef INCLUDE_LIBYUV_GENERAL_H_ -#define INCLUDE_LIBYUV_GENERAL_H_ - -#include "libyuv/basic_types.h" - -namespace libyuv { - -// I420 mirror -int -I420Mirror(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int width, int height); - -// Crop/Pad I420 frame to match required dimensions. -int -I420CropPad(const uint8* src_frame, int src_width, - int src_height, uint8* dst_frame, - int dst_width, int dst_height); - -// I420 Crop - crop a rectangle from image -int -I420Crop(uint8* frame, - int src_width, int src_height, - int dst_width, int dst_height); - -} // namespace libyuv - -#endif // INCLUDE_LIBYUV_GENERAL_H_ diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h new file mode 100644 index 00000000..67090cf0 --- /dev/null +++ b/files/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,188 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. Returns true if image looks valid and format is supported. + // If return value is true, then the values for all the following getters + // are populated. + // src_len is the size of the compressed mjpeg frame in bytes. + bool LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. 
+ int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + bool UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + bool DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + bool DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + struct Buffer { + const uint8* data; + int len; + }; + + struct BufferVector { + Buffer* buffers; + int len; + int pos; + }; + + // Methods that are passed to jpeglib. + static int fill_input_buffer(jpeg_decompress_struct* cinfo); + static void init_source(jpeg_decompress_struct* cinfo); + static void skip_input_data(jpeg_decompress_struct* cinfo, + long num_bytes); // NOLINT + static void term_source(jpeg_decompress_struct* cinfo); + + static void ErrorHandler(jpeg_common_struct* cinfo); + + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + bool StartDecode(); + bool FinishDecode(); + + void SetScanlinePointers(uint8** data); + bool DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // true iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + bool has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. 
+ uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h index 9c0a10a3..7e43dabb 100644 --- a/files/include/libyuv/planar_functions.h +++ b/files/include/libyuv/planar_functions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,155 +8,331 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -// Copy I420 to I420. -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value); -// Draw a rectangle into I420 -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v); +// Alias. +#define I400ToI400 CopyPlane -// Convert I422 to I420. Used by MJPG. -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +// Copy a plane of data (I420 to I400). +LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, int width, int height); -// Convert NV12 to I420. Also used for NV21. -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert NV12 to I420. Deprecated. -int NV12ToI420(const uint8* src_y, - const uint8* src_uv, int src_stride, +// Convert UYVY to I422. +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert Q420 to I420. -int Q420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_yuy2, int src_stride_yuy2, +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert M420 to I420. -int M420ToI420(const uint8* src_m420, int src_stride_m420, +// I420 mirror. 
+LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert YUY2 to I420. -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. +LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Aliases. +#define ARGBToBGRA BGRAToARGB +#define ARGBToABGR ABGRToARGB + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, int width, int height); -// Convert UYVY to I420. -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, +// ARGB little endian (bgra in memory) to I422. +LIBYUV_API +int ARGBToI422(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert I420 to ARGB. -int I420ToARGB(const uint8* src_y, int src_stride_y, +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, int width, int height); -// Convert I420 to BGRA. -int I420ToBGRA(const uint8* src_y, int src_stride_y, +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, int width, int height); -// Convert I420 to ABGR. -int I420ToABGR(const uint8* src_y, int src_stride_y, +// Convert I422 to RGBA. 
+LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, int width, int height, + int value_y, int value_u, int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height, uint32 value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert I422 to ARGB. -int I422ToARGB(const uint8* src_y, int src_stride_y, +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int x, int y, int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_frame, int dst_stride_frame, int width, int height); -// Convert I444 to ARGB. -int I444ToARGB(const uint8* src_y, int src_stride_y, +// Convert I422 to UYVY. 
+LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_frame, int dst_stride_frame, int width, int height); -// Convert I400 to ARGB. -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Convert unattentuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); -// Convert I400 to ARGB. Reverse of ARGBToI400 -int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); -// Convert RAW to ARGB. -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Convert MJPG to ARGB. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, int dw, int dh); -// Convert BG24 to ARGB. -int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); -// Convert ABGR to ARGB. Also used for ARGB to ABGR. -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Blur ARGB image. +// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned +// to 16 byte boundary. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); -// Convert BGRA to ARGB. Also used for ARGB to BGRA. -int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value); -// Convert ARGB to I400. -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +// Interpolate between two ARGB images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 +// and 255 means 1% src_argb0 and 99% src_argb1. +// Internally uses ARGBScale bilinear filtering. +// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. 
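The attenuate and blend entry points above combine naturally: ARGBAttenuate converts an unattenuated image to preattenuated (premultiplied) form, which is the representation ARGBBlend works on. A minimal compositing sketch (not from the patch; the scratch buffer, packed strides, and opaque background are assumptions):

#include <vector>
#include "libyuv/planar_functions.h"

// Hypothetical helper: premultiply the foreground, then blend it over the
// background. Tightly packed strides and an opaque background are assumed.
int BlendForegroundOver(const uint8* fg_argb, const uint8* bg_argb,
                        uint8* dst_argb, int width, int height) {
  const int stride = width * 4;
  std::vector<uint8> premul(static_cast<size_t>(stride) * height);
  int ret = libyuv::ARGBAttenuate(fg_argb, stride, &premul[0], stride,
                                  width, height);
  if (ret != 0) {
    return ret;
  }
  return libyuv::ARGBBlend(&premul[0], stride, bg_argb, stride,
                           dst_argb, stride, width, height);
}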
+LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation); + +#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif +// Row functions for copying a pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +// The following are available on all x86 platforms: +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +#define HAS_ARGBAFFINEROW_SSE2 +#endif +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h index 65c38de3..e7608a2d 100644 --- a/files/include/libyuv/rotate.h +++ b/files/include/libyuv/rotate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,45 +8,103 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_H_ +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -// Supported rotation +// Supported rotation. enum RotationMode { - kRotate0 = 0, // No rotation - kRotate90 = 90, // Rotate 90 degrees clockwise - kRotate180 = 180, // Rotate 180 degrees - kRotate270 = 270, // Rotate 270 degrees clockwise + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. - // Deprecated + // Deprecated. kRotateNone = 0, kRotateClockwise = 90, kRotateCounterClockwise = 270, }; -// Rotate I420 frame +// Rotate I420 frame. +LIBYUV_API int I420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, - int width, int height, - RotationMode mode); + int src_width, int src_height, RotationMode mode); -// Rotate NV12 input and store in I420 +// Rotate NV12 input and store in I420. 
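A usage sketch for the rotation API above (not part of the patch): rotating an I420 frame by 90 degrees swaps the output dimensions, so the destination strides are derived from the source height. Contiguous, unpadded planes are assumed.

#include "libyuv/rotate.h"

// Hypothetical helper; the caller provides destination planes sized for a
// src_height x src_width image.
int RotateI420By90(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v,
                   int src_width, int src_height) {
  return libyuv::I420Rotate(src_y, src_width,
                            src_u, (src_width + 1) / 2,
                            src_v, (src_width + 1) / 2,
                            dst_y, src_height,
                            dst_u, (src_height + 1) / 2,
                            dst_v, (src_height + 1) / 2,
                            src_width, src_height,
                            libyuv::kRotate90);
}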
+LIBYUV_API int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, - int width, int height, - RotationMode mode); + int src_width, int src_height, RotationMode mode); +// Rotate planes by 90, 180, 270 +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif -#endif // INCLUDE_LIBYUV_ROTATE_H_ +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h new file mode 100644 index 00000000..a2781df3 --- /dev/null +++ b/files/include/libyuv/rotate_argb.h @@ -0,0 +1,33 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h new file mode 100644 index 00000000..4814f254 --- /dev/null +++ b/files/include/libyuv/row.h @@ -0,0 +1,731 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Remove kMaxStride +#define kMaxStride (2880 * 4) +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif +// True if compiling for SSSE3 as a requirement. +#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) +#define LIBYUV_SSSE3_ONLY +#endif + +// The following are available on all x86 platforms: +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Conversions. +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTORGBAROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_SSE2 +#define HAS_COPYROW_X86 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROWUV_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_SETROW_X86 +#define HAS_SPLITUV_SSE2 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YTOARGBROW_SSE2 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 + +// Effects +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBINTERPOLATEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADE_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGE_SSE2 +#endif + +// The following are Windows only: +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_RGBATOARGBROW_SSSE3 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#endif + +// The following are disabled when SSSE3 is available: +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_SSSE3_ONLY) +#define HAS_ARGBATTENUATE_SSE2 +#define HAS_ARGBBLENDROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#endif + +// The following are available on Neon platforms +#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_COPYROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define 
HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORROWUV_NEON +#define HAS_SETROW_NEON +#define HAS_SPLITUV_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON + +// TODO(fbarchard): Hook these up to calling functions. +#define HAS_ABGRTOARGBROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGBAROW_NEON +#define HAS_BGRATOARGBROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGBATOARGBROW_NEON +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) int8 vec8[16]; +typedef __declspec(align(16)) uint8 uvec8[16]; +typedef __declspec(align(16)) int16 vec16[8]; +typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) uint32 uvec32[4]; +#elif defined(__GNUC__) +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef int8 __attribute__((vector_size(16))) vec8; +typedef uint8 __attribute__((vector_size(16))) uvec8; +typedef int16 __attribute__((vector_size(16))) vec16; +typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef int32 __attribute__((vector_size(16))) vec32; +typedef uint32 __attribute__((vector_size(16))) uvec32; +#else +#define SIMD_ALIGNED(var) var +typedef int8 vec8[16]; +typedef uint8 uvec8[16]; +typedef int16 vec16[8]; +typedef uint16 uvec16[8]; +typedef int32 vec32[4]; +typedef uint32 uvec32[4]; +#endif + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +void I422ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRGBARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRGB24Row_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRAWRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void NV12ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); +void NV21ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); + +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); + +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, 
uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_C(const uint8* src, uint8* dst, int width); + +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width); + +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); + +void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); + +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_X86(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); + +void SetRow8_X86(uint8* dst, uint32 v32, int count); +void SetRows32_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow8_NEON(uint8* dst, uint32 v32, int count); +void SetRows32_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow8_C(uint8* dst, uint32 v32, int count); +void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height); + +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RGBAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); + +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix); +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); +void 
RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); + +void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix); +void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix); +void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix); +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); + +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); + +void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + +void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + +void I422ToRGBARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width); +void I422ToRGB24Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb24_buf, + int width); +void I422ToRAWRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* raw_buf, + int width); + +void 
YToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width); + +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width); + +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width); + +void I444ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + +void I422ToRGBARow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width); + +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* argb_buf, + int width); + +// ARGB preattenuated alpha blend. 
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGBAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void I422ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToBGRARow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToABGRRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRGBARow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRGB24Row_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToRAWRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void NV12ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix); +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* 
dst_v, int pix); +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix); +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); + +// Inverse table for unattenuate, shared by C and SSE2. 
+extern uint32 fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); + +void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); + +// Used for blur. +void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); + +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT + diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h index 8433908b..18098798 100644 --- a/files/include/libyuv/scale.h +++ b/files/include/libyuv/scale.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,20 +8,31 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_H_ +#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_H_ #include "libyuv/basic_types.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif // Supported filtering enum FilterMode { - kFilterNone = 0, // Point sample; Fastest + kFilterNone = 0, // Point sample; Fastest. kFilterBilinear = 1, // Faster than box, but lower quality scaling down. - kFilterBox = 2 // Highest quality + kFilterBox = 2 // Highest quality. }; +// Scale a YUV plane. 
+LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering); + // Scales a YUV 4:2:0 image from the src width and height to the // dst width and height. // If filtering is kFilterNone, a simple nearest-neighbor algorithm is @@ -32,6 +43,7 @@ enum FilterMode { // quality image, at further expense of speed. // Returns 0 if successful. +LIBYUV_API int I420Scale(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -42,15 +54,8 @@ int I420Scale(const uint8* src_y, int src_stride_y, int dst_width, int dst_height, FilterMode filtering); -// Legacy API -// If dst_height_offset is non-zero, the image is offset by that many pixels -// and stretched to (dst_height - dst_height_offset * 2) pixels high, -// instead of dst_height. -int Scale(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_height_offset, - bool interpolate); - -// Same, but specified src terms of each plane location and stride. +// Legacy API. Deprecated. +LIBYUV_API int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, @@ -59,9 +64,19 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int dst_width, int dst_height, bool interpolate); -// For testing, allow disabling of optimizations. +// Legacy API. Deprecated. +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + bool interpolate); + +// For testing, allow disabling of specialized scalers. +LIBYUV_API void SetUseReferenceImpl(bool use); -} // namespace libyuv +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif -#endif // INCLUDE_LIBYUV_SCALE_H_ +#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h new file mode 100644 index 00000000..1af0e1dc --- /dev/null +++ b/files/include/libyuv/scale_argb.h @@ -0,0 +1,34 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h new file mode 100644 index 00000000..e782ae18 --- /dev/null +++ b/files/include/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
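A usage sketch of the scaling entry points above (not part of the patch): downscaling an I420 frame to half size with bilinear filtering. Unpadded planes and even source dimensions are assumed, and the argument order follows the I420Scale declaration (source planes and size, then destination planes and size, then the filter mode).

#include "libyuv/scale.h"

// Hypothetical helper; assumes tightly packed planes and even dimensions.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              int src_width, int src_height,
              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_width / 2;
  const int dst_height = src_height / 2;
  return libyuv::I420Scale(src_y, src_width,
                           src_u, (src_width + 1) / 2,
                           src_v, (src_width + 1) / 2,
                           src_width, src_height,
                           dst_y, dst_width,
                           dst_u, (dst_width + 1) / 2,
                           dst_v, (dst_width + 1) / 2,
                           dst_width, dst_height,
                           libyuv::kFilterBilinear);
}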
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 397 + +#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h new file mode 100644 index 00000000..5d812c98 --- /dev/null +++ b/files/include/libyuv/video_common.h @@ -0,0 +1,159 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. + +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#define FOURCC(a, b, c, d) ( \ + (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ + (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +enum FourCC { + // Canonical fourcc codes used in our code. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_V210 = FOURCC('V', '2', '1', '0'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // bgr565. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // abgr1555. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444. 
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + // Next four are Bayer RGB formats. The four characters define the order of + // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom. + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // Aliases for canonical fourcc codes, replaced with their canonical + // equivalents by CanonicalFourCC(). + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + + // Match any fourcc. + FOURCC_ANY = 0xFFFFFFFF, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_V210 = 22, // 128 / 6 actually. + FOURCC_BPP_24BG = 24, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + // Next four are Bayer RGB formats. The four characters define the order of + // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom. + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + + // Aliases for canonical fourcc codes, replaced with their canonical + // equivalents by CanonicalFourCC(). + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. +LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT diff --git a/files/libyuv.gyp b/files/libyuv.gyp index d5abab73..18137538 100644 --- a/files/libyuv.gyp +++ b/files/libyuv.gyp @@ -1,4 +1,4 @@ -# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. +# Copyright 2011 The LibYuv Project Authors. All rights reserved. 
# # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source @@ -7,60 +7,85 @@ # be found in the AUTHORS file in the root of the source tree. { + 'variables': { + 'use_system_libjpeg%': 0, + }, 'targets': [ { 'target_name': 'libyuv', 'type': 'static_library', + # 'type': 'shared_library', + 'conditions': [ + ['use_system_libjpeg==0', { + 'dependencies': [ + '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg', + ], + }, { + 'link_settings': { + 'libraries': [ + '-ljpeg', + ], + }, + }], + ], + 'defines': [ + 'HAVE_JPEG', + # 'LIBYUV_BUILDING_SHARED_LIBRARY', + ], 'include_dirs': [ - 'common', 'include', + '.', ], 'direct_dependent_settings': { 'include_dirs': [ - 'common', 'include', + '.', ], }, 'sources': [ - # includes - 'include/convert.h', - 'include/general.h', - 'include/scale.h', - 'include/planar_functions.h', + # includes. + 'include/libyuv.h', + 'include/libyuv/basic_types.h', + 'include/libyuv/compare.h', + 'include/libyuv/convert.h', + 'include/libyuv/convert_argb.h', + 'include/libyuv/convert_from.h', + 'include/libyuv/cpu_id.h', + 'include/libyuv/format_conversion.h', + 'include/libyuv/mjpeg_decoder.h', + 'include/libyuv/planar_functions.h', + 'include/libyuv/rotate.h', + 'include/libyuv/rotate_argb.h', + 'include/libyuv/row.h', + 'include/libyuv/scale.h', + 'include/libyuv/scale_argb.h', + 'include/libyuv/version.h', + 'include/libyuv/video_common.h', - # headers - 'common/basic_types.h', - 'common/common.h', - 'common/constructor_magic.h', - 'source/cpu_id.h', - 'source/rotate.h' - 'source/row.h', - 'source/video_common.h', - - # sources + # sources. + 'source/compare.cc', + 'source/compare_neon.cc', 'source/convert.cc', + 'source/convert_argb.cc', + 'source/convert_from.cc', 'source/cpu_id.cc', 'source/format_conversion.cc', - 'source/general.cc', + 'source/mjpeg_decoder.cc', 'source/planar_functions.cc', 'source/rotate.cc', - 'source/row_table.cc', + 'source/rotate_argb.cc', + 'source/rotate_neon.cc', + 'source/row_common.cc', + 'source/row_neon.cc', + 'source/row_posix.cc', + 'source/row_win.cc', 'source/scale.cc', + 'source/scale_neon.cc', + 'source/scale_argb.cc', 'source/video_common.cc', ], - 'conditions': [ - ['OS=="win"', { - 'sources': [ - 'source/row_win.cc', - ], - },{ # else - 'sources': [ - 'source/row_posix.cc', - ], - }], - ] }, - ], # targets + ], # targets. } # Local Variables: diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp new file mode 100755 index 00000000..27cec8f4 --- /dev/null +++ b/files/libyuv_test.gyp @@ -0,0 +1,74 @@ +# Copyright 2011 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. 
+ +{ + 'targets': [ + { + 'target_name': 'libyuv_unittest', + 'type': 'executable', + 'dependencies': [ + 'libyuv.gyp:libyuv', + # The tests are based on gtest + 'testing/gtest.gyp:gtest', + 'testing/gtest.gyp:gtest_main', + ], + 'defines': [ + 'LIBYUV_SVNREVISION="<!(svnversion -n)"', + # 'LIBYUV_USING_SHARED_LIBRARY', + ], + 'sources': [ + # headers + 'unit_test/unit_test.h', + + # sources + 'unit_test/compare_test.cc', + 'unit_test/cpu_test.cc', + 'unit_test/planar_test.cc', + 'unit_test/rotate_argb_test.cc', + 'unit_test/rotate_test.cc', + 'unit_test/scale_argb_test.cc', + 'unit_test/scale_test.cc', + 'unit_test/unit_test.cc', + 'unit_test/version_test.cc', + ], + 'conditions': [ + ['OS=="linux"', { + 'cflags': [ + '-fexceptions', + ], + }], + ], # conditions + }, + + { + 'target_name': 'compare', + 'type': 'executable', + 'dependencies': [ + 'libyuv.gyp:libyuv', + ], + 'sources': [ + # sources + 'util/compare.cc', + ], + 'conditions': [ + ['OS=="linux"', { + 'cflags': [ + '-fexceptions', + ], + }], + ], # conditions + }, + + ], # targets +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/files/source/compare.cc b/files/source/compare.cc new file mode 100644 index 00000000..bf4a7dae --- /dev/null +++ b/files/source/compare.cc @@ -0,0 +1,571 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include <float.h> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. 
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + for (int i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +// This module is for Visual C x86 +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_HASHDJB2_SSE41 +static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 16 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 14 // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 1 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +// GCC 4.2 on OSX has link error when passing static or const to inline. +// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. 
+#ifdef __APPLE__ +#define CONST +#else +#define CONST static const +#endif +#define HAS_HASHDJB2_SSE41 +CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +CONST uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +CONST uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +CONST uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +CONST uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; +static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); + return hash; +} +#endif // HAS_HASHDJB2_SSE41 + +// hash seed of 5381 recommended. 
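The kHash16x33 and kHashMul* constants above come from unrolling the djb2 recurrence hash = hash * 33 + byte over 16 bytes at a time: after each block the running hash is multiplied by 33^16 and byte i contributes src[i] * 33^(15 - i). A scalar restatement of that step (illustrative only, not from the patch):

#include "libyuv/basic_types.h"

// One 16-byte block of djb2, equivalent to what the SSE4.1 paths compute.
static uint32 HashDjb2Block16(const uint8* src, uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < 16; ++i) {
    hash = hash * 33 + src[i];  // Same as hash += (hash << 5) + src[i].
  }
  // hash == seed * 33^16 + sum(src[i] * 33^(15 - i)), modulo 2^32.
  return hash;
}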
+LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif + + const int kBlockSize = 1 << 15; // 32768; + while (count >= static_cast<uint64>(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + int remainder = static_cast<int>(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = static_cast<int>(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SUMSQUAREERROR_NEON + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); + +#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_SUMSQUAREERROR_SSE2 +__declspec(naked) __declspec(align(16)) +static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, + int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + sub edx, eax + + align 16 + wloop: + movdqa xmm1, [eax] + movdqa xmm2, [eax + edx] + lea eax, [eax + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0EEh + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 01h + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, + int count) { + uint32 sse; + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm1 \n" + "movdqa (%0,%1,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); + return sse; +} +#endif + +static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, + int count) { + uint32 sse = 0u; + for (int i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += static_cast<uint32>(diff * diff); + } + return sse; +} + +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#elif defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_a, 
16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif + // 32K values will fit a 32bit int return value from SumSquareError. + // After each block of 32K, accumulate into 64 bit int. + const int kBlockSize = 1 << 15; // 32768; + uint64 sse = 0; +#ifdef _OPENMP +#pragma omp parallel for reduction(+: sse) +#endif + for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + int remainder = count & (kBlockSize - 1) & ~15; + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 15; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#elif defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) && + IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) { + SumSquareError = SumSquareError_SSE2; + } +#endif + + uint64 sse = 0; + for (int h = 0; h < height; ++h) { + sse += SumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count) { + double psnr; + if (sse > 0) { + double mse = static_cast<double>(count) / static_cast<double>(sse); + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) + psnr = kMaxPsnr; + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + const uint64 samples = width * height; + const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, + src_b, stride_b, + width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, + src_y_b, stride_y_b, + width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + const uint64 samples = width * height + 2 * (width_uv * height_uv); + const uint64 sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64 cc1 = 26634; // (64^2*(.01*255)^2 +static const int64 cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) { + int64 sum_a = 0; + int64 sum_b = 0; + int64 sum_sq_a = 0; + int64 sum_sq_b = 0; + int64 
sum_axb = 0; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + const int64 count = 64; + // scale the constants by number of pixels + const int64 c1 = (cc1 * count * count) >> 12; + const int64 c2 = (cc2 * count * count) >> 12; + + const int64 sum_a_x_sum_b = sum_a * sum_b; + + const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64 sum_a_sq = sum_a*sum_a; + const int64 sum_b_sq = sum_b*sum_b; + + const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) + return DBL_MAX; + return ssim_n * 1.0 / ssim_d; +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + int samples = 0; + double ssim_total = 0; + + double (*Ssim8x8)(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b); + + Ssim8x8 = Ssim8x8_C; + + // sample point start with each 4x4 location + for (int i = 0; i < height - 8; i += 4) { + for (int j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, + src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc new file mode 100644 index 00000000..d8b375b8 --- /dev/null +++ b/files/source/compare_neon.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! 
\n" + "vld1.u8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h index 9a328649..ef3ebf36 100644 --- a/files/source/conversion_tables.h +++ b/files/source/conversion_tables.h @@ -18,7 +18,10 @@ #ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_ #define LIBYUV_SOURCE_CONVERSION_TABLES_H_ +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif /****************************************************************************** * YUV TO RGB approximation @@ -197,7 +200,10 @@ namespace libyuv { Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251), Vcg(252),Vcg(253),Vcg(254),Vcg(255)}; -} // namespace libyuv +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif #endif diff --git a/files/source/convert.cc b/files/source/convert.cc index 8154dcb7..0882c92b 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -10,174 +10,131 @@ #include "libyuv/convert.h" -#include "conversion_tables.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" -#include "row.h" - -//#define SCALEOPT //Currently for windows only. June 2010 - -#ifdef SCALEOPT -#include <emmintrin.h> +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" #endif +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -static inline uint8 Clip(int32 val) { - if (val < 0) { - return (uint8) 0; - } else if (val > 255){ - return (uint8) 255; - } - return (uint8) val; -} - -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { +// Copy I420 with optional flipping +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } - // RGB orientation - bottom up - // TODO(fbarchard): support inversion - uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame; - uint8* out2 = out - dst_stride_frame; - int h, w; - int tmp_r, tmp_g, tmp_b; - const uint8 *y1, *y2 ,*u, *v; - y1 = src_y; - y2 = y1 + src_stride_y; - u = src_u; - v = src_v; - for (h = ((height + 1) >> 1); h > 0; h--){ - // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((width + 1) >> 1); w++){ - // Vertical and horizontal sub-sampling - tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); - out[0] = Clip(tmp_b); - out[1] = Clip(tmp_g); - out[2] = Clip(tmp_r); - - tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); - out[3] = Clip(tmp_b); - out[4] = Clip(tmp_g); - out[5] = Clip(tmp_r); - - tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); - out2[0] = Clip(tmp_b); - out2[1] = Clip(tmp_g); - out2[2] = Clip(tmp_r); - - tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); - out2[3] = Clip(tmp_b); - out2[4] = Clip(tmp_g); - out2[5] = Clip(tmp_r); - - out += 6; - out2 += 6; - y1 += 2; - y2 += 2; - u++; - v++; - } - y1 += src_stride_y + src_stride_y - width; - y2 += src_stride_y + src_stride_y - width; - u += src_stride_u - ((width + 1) >> 1); - v += src_stride_v - ((width + 1) >> 1); - out -= dst_stride_frame * 3; - out2 -= dst_stride_frame * 3; - } // end height for + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } -// Little Endian... -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { - return -1; +// Move to row_win etc. 
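The HalfRow helpers that follow implement the vertical half of 4:2:2-to-4:2:0 chroma reduction: each output byte is the rounded average of a byte and the byte one stride below it, which is exactly what the SSE2 pavgb instruction computes. A plain C sketch of that behavior (the name is illustrative; HalfRow_C below is the real fallback):

#include <stdint.h>

// dst[x] = rounded average of src[x] and the byte src_stride bytes below,
// i.e. (a + b + 1) >> 1, matching pavgb rounding.
static void HalfRowRef(const uint8_t* src, int src_stride,
                       uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] + src[x + src_stride] + 1) >> 1);
  }
}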
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_HALFROW_SSE2 +__declspec(naked) __declspec(align(16)) +static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // src_uv_stride + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + sub edi, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + pavgb xmm0, [eax + edx] + sub ecx, 16 + movdqa [eax + edi], xmm0 + lea eax, [eax + 16] + jg convertloop + pop edi + ret } +} - // RGB orientation - bottom up - uint8* out = dst_frame + dst_stride_frame * (height - 1); - uint8* out2 = out - dst_stride_frame; - int tmp_r, tmp_g, tmp_b; - const uint8 *y1,*y2, *u, *v; - y1 = src_y; - y2 = y1 + src_stride_y; - u = src_u; - v = src_v; - int h, w; - - for (h = ((height + 1) >> 1); h > 0; h--) { - // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((width + 1) >> 1); w++) { - // Vertical and horizontal sub-sampling - // Convert to RGB888 and re-scale to 4 bits - tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); - out[0] =(uint8)((Clip(tmp_g) & 0xf0) + (Clip(tmp_b) >> 4)); - out[1] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); - - tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); - out[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); - out[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); - - tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); - out2[0] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); - out2[1] = (uint8) (0xf0 + (Clip(tmp_r) >> 4)); - - tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); - out2[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); - out2[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); - - out += 4; - out2 += 4; - y1 += 2; - y2 += 2; - u++; - v++; - } - y1 += 2 * src_stride_y - width; - y2 += 2 * src_stride_y - width; - u += src_stride_u - ((width + 1) >> 1); - v += src_stride_v - ((width + 1) >> 1); - out -= (dst_stride_frame + width) * 2; - out2 -= (dst_stride_frame + width) * 2; - } // end height for - return 0; +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_HALFROW_SSE2 +static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + "sub %0,%1 \n" + ".p2align 4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "pavgb (%0,%3),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%0,%1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(pix) // %2 + : "r"(static_cast<intptr_t>(src_uv_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0" +#endif +); } +#endif +static void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + for (int x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int 
src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; @@ -188,717 +145,1937 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); - uint16* out2 = out - dst_stride_frame; - - int tmp_r, tmp_g, tmp_b; - const uint8* y1,* y2, * u, * v; - y1 = src_y; - y2 = y1 + src_stride_y; - u = src_u; - v = src_v; - int h, w; - - for (h = ((height + 1) >> 1); h > 0; h--){ - // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((width + 1) >> 1); w++){ - // Vertical and horizontal sub-sampling - // 1. Convert to RGB888 - // 2. Shift to adequate location (in the 16 bit word) - RGB 565 - - tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); - out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); - out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b ) >> 3); - - tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); - out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); - out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - y1 += 2; - y2 += 2; - out += 2; - out2 += 2; - u++; - v++; - } - y1 += 2 * src_stride_y - width; - y2 += 2 * src_stride_y - width; - u += src_stride_u - ((width + 1) >> 1); - v += src_stride_v - ((width + 1) >> 1); - out -= 2 * dst_stride_frame + width; - out2 -= 2 * dst_stride_frame + width; + int halfwidth = (width + 1) >> 1; + void (*HalfRow)(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) = HalfRow_C; +#if defined(HAS_HALFROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(halfwidth, 16) && + IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && + IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + HalfRow = HalfRow_SSE2; + } +#endif + + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // SubSample U plane. 
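Both chroma loops in this function follow the same shape: pairs of source rows are averaged into one destination row, and an odd trailing row is handled by passing a stride of 0 so it is averaged with itself. A self-contained sketch of that per-plane pattern, equivalent in effect to the HalfRow-based loops below (helper name and stdint types are illustrative):

#include <stdint.h>

static void SubsampleRowsBy2(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height) {
  int y;
  for (y = 0; y < height - 1; y += 2) {
    for (int x = 0; x < width; ++x) {  // average rows y and y + 1
      dst[x] = (uint8_t)((src[x] + src[x + src_stride] + 1) >> 1);
    }
    src += src_stride * 2;
    dst += dst_stride;
  }
  if (height & 1) {  // odd height: the last row maps to itself
    for (int x = 0; x < width; ++x) {
      dst[x] = src[x];
    }
  }
}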
+ int y; + for (y = 0; y < height - 1; y += 2) { + HalfRow(src_u, src_stride_u, dst_u, halfwidth); + src_u += src_stride_u * 2; + dst_u += dst_stride_u; + } + if (height & 1) { + HalfRow(src_u, 0, dst_u, halfwidth); + } + + // SubSample V plane. + for (y = 0; y < height - 1; y += 2) { + HalfRow(src_v, src_stride_v, dst_v, halfwidth); + src_v += src_stride_v * 2; + dst_v += dst_stride_v; + } + if (height & 1) { + HalfRow(src_v, 0, dst_v, halfwidth); } return 0; } +// Blends 32x2 pixels to 16x1 +// source in scale.cc +#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) + +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +#endif +void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } - uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); - uint16* out2 = out - dst_stride_frame ; - int32 tmp_r, tmp_g, tmp_b; - const uint8 *y1,*y2, *u, *v; - int h, w; - - y1 = src_y; - y2 = y1 + src_stride_y; - u = src_u; - v = src_v; - - for (h = ((height + 1) >> 1); h > 0; h--){ - // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((width + 1) >> 1); w++){ - // Vertical and horizontal sub-sampling - // 1. Convert to RGB888 - // 2. Shift to adequate location (in the 16 bit word) - RGB 555 - // 3. 
Add 1 for alpha value - tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); - out[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + - ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); - - tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); - out[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + - ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); - - tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); - out2[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + - ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); - - tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); - out2[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + - ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); - - y1 += 2; - y2 += 2; - out += 2; - out2 += 2; - u++; - v++; - } - y1 += 2 * src_stride_y - width; - y2 += 2 * src_stride_y - width; - u += src_stride_u - ((width + 1) >> 1); - v += src_stride_v - ((width + 1) >> 1); - out -= 2 * dst_stride_frame + width; - out2 -= 2 * dst_stride_frame + width; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + int halfwidth = (width + 1) >> 1; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C; +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(halfwidth, 16)) { + ScaleRowDown2 = ScaleRowDown2Int_NEON; + } +#elif defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(halfwidth, 16) && + IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && + IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + ScaleRowDown2 = ScaleRowDown2Int_SSE2; + } +#endif + + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // SubSample U plane. + int y; + for (y = 0; y < height - 1; y += 2) { + ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth); + src_u += src_stride_u * 2; + dst_u += dst_stride_u; + } + if (height & 1) { + ScaleRowDown2(src_u, 0, dst_u, halfwidth); + } + + // SubSample V plane. 
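For 4:4:4 chroma both dimensions have to drop by two, so the row helper used here (ScaleRowDown2Int from scale.cc, declared above) is assumed to be a 2x2 box filter rather than a pure vertical average; the U loop above and the V loop below call it the same way. A plain C sketch of that assumed behavior:

#include <stddef.h>
#include <stdint.h>

// Each output pixel is the rounded average of a 2x2 block spanning two
// adjacent source rows.
static void ScaleRowDown2BoxRef(const uint8_t* src, ptrdiff_t src_stride,
                                uint8_t* dst, int dst_width) {
  const uint8_t* next = src + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[0] + src[1] + next[0] + next[1] + 2) >> 2);
    src += 2;
    next += 2;
  }
}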
+ for (y = 0; y < height - 1; y += 2) { + ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth); + src_v += src_stride_v * 2; + dst_v += dst_stride_v; + } + if (height & 1) { + ScaleRowDown2(src_v, 0, dst_v, halfwidth); } return 0; } +// use Bilinear for upsampling chroma +void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr); -int I420ToYUY2(const uint8* src_y, int src_stride_y, +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } - const uint8* in1 = src_y; - const uint8* in2 = src_y + src_stride_y; - - uint8* out1 = dst_frame; - uint8* out2 = dst_frame + dst_stride_frame; - - // YUY2 - Macro-pixel = 2 image pixels - // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... -#ifndef SCALEOPT - for (int i = 0; i < ((height + 1) >> 1); i++){ - for (int j = 0; j < ((width + 1) >> 1); j++){ - out1[0] = in1[0]; - out1[1] = *src_u; - out1[2] = in1[1]; - out1[3] = *src_v; - - out2[0] = in2[0]; - out2[1] = *src_u; - out2[2] = in2[1]; - out2[3] = *src_v; - out1 += 4; - out2 += 4; - src_u++; - src_v++; - in1 += 2; - in2 += 2; - } - in1 += 2 * src_stride_y - width; - in2 += 2 * src_stride_y - width; - src_u += src_stride_u - ((width + 1) >> 1); - src_v += src_stride_v - ((width + 1) >> 1); - out1 += dst_stride_frame + dst_stride_frame - 2 * width; - out2 += dst_stride_frame + dst_stride_frame - 2 * width; + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } -#else - for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) { - int32 width__ = (width >> 4); - _asm - { - ;pusha - mov eax, DWORD PTR [in1] ;1939.33 - mov ecx, DWORD PTR [in2] ;1939.33 - mov ebx, DWORD PTR [src_u] ;1939.33 - mov edx, DWORD PTR [src_v] ;1939.33 - loop0: - movq xmm6, QWORD PTR [ebx] ;src_u - movq xmm0, QWORD PTR [edx] ;src_v - punpcklbw xmm6, xmm0 ;src_u, src_v mix - ;movdqa xmm1, xmm6 - ;movdqa xmm2, xmm6 - ;movdqa xmm4, xmm6 - - movdqu xmm3, XMMWORD PTR [eax] ;in1 - movdqa xmm1, xmm3 - punpcklbw xmm1, xmm6 ;in1, src_u, in1, src_v - mov esi, DWORD PTR [out1] - movdqu XMMWORD PTR [esi], xmm1 ;write to out1 - - movdqu xmm5, XMMWORD PTR [ecx] ;in2 - movdqa xmm2, xmm5 - punpcklbw xmm2, xmm6 ;in2, src_u, in2, src_v - mov edi, DWORD PTR [out2] - movdqu XMMWORD PTR [edi], xmm2 ;write to out2 - - punpckhbw xmm3, xmm6 ;in1, src_u, in1, src_v again - movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again - add esi, 32 - mov DWORD PTR [out1], esi - - punpckhbw xmm5, xmm6 ;src_u, in2, src_v again - movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again - add edi, 32 - mov DWORD PTR [out2], edi - - add ebx, 8 - add edx, 8 - add eax, 16 - add ecx, 16 - - mov esi, 
DWORD PTR [width__] - sub esi, 1 - mov DWORD PTR [width__], esi - jg loop0 - - mov DWORD PTR [in1], eax ;1939.33 - mov DWORD PTR [in2], ecx ;1939.33 - mov DWORD PTR [src_u], ebx ;1939.33 - mov DWORD PTR [src_v], edx ;1939.33 - - ;popa - emms - } - in1 += 2 * src_stride_y - width; - in2 += 2 * src_stride_y - width; - out1 += dst_stride_frame + dst_stride_frame - 2 * width; - out2 += dst_stride_frame + dst_stride_frame - 2 * width; + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + int quarterwidth = (width + 3) >> 2; + + // Resample U plane. + ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height + halfwidth, halfheight, // to 1/2 width, 1/2 height + src_stride_u, + dst_stride_u, + src_u, dst_u); + + // Resample V plane. + ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height + halfwidth, halfheight, // to 1/2 width, 1/2 height + src_stride_v, + dst_stride_v, + src_v, dst_v); + return 0; +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride_frame, + int width, int height) { + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; + } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif + + // Copy plane + for (int y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride_frame, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride_frame * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. 
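Put differently, an M420 buffer repeats a three-row band of [Y row, Y row, UV row], all sharing one stride. The pointer arithmetic that M420ToI420 further down feeds into X420ToI420 (Y strides of s and 2*s, UV starting 2*s into the buffer with a step of 3*s) can be sketched as follows (helper names are illustrative):

#include <stdint.h>

// Row addressing for an M420 buffer with stride s, assuming the band
// layout described above.
static inline const uint8_t* M420YRow(const uint8_t* m420, int s, int y_row) {
  return m420 + (y_row / 2) * 3 * s + (y_row & 1) * s;
}

static inline const uint8_t* M420UVRow(const uint8_t* m420, int s, int uv_row) {
  return m420 + uv_row * 3 * s + 2 * s;
}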
+static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + + int halfwidth = (width + 1) >> 1; + void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUV_C; +#if defined(HAS_SPLITUV_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) { + SplitUV = SplitUV_NEON; + } +#elif defined(HAS_SPLITUV_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(halfwidth, 16) && + IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + SplitUV = SplitUV_SSE2; } #endif + + if (dst_y) { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + + int halfheight = (height + 1) >> 1; + for (int y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUV(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } return 0; } -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, int width, int height) { - if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert Q420 to I420. +// Format is rows of YY/YUYV +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_yuy2 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
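Note that here, as in X420ToI420 above, the flip is applied to the destination rather than the source: the output pointers start at the last row and the destination strides are negated, so rows are written bottom-up, which yields the same vertically mirrored result as flipping the source.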
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // CopyRow for rows of just Y in Q420 copied to Y plane of I420. + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif - int i = 0; - const uint8* y1 = src_y; - const uint8* y2 = y1 + src_stride_y; - const uint8* u = src_u; - const uint8* v = src_v; - - uint8* out1 = dst_frame; - uint8* out2 = dst_frame + dst_stride_frame; - - // Macro-pixel = 2 image pixels - // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5..... - -#ifndef SCALEOPT - for (; i < ((height + 1) >> 1); i++) { - for (int j = 0; j < ((width + 1) >> 1); j++) { - out1[0] = *u; - out1[1] = y1[0]; - out1[2] = *v; - out1[3] = y1[1]; - - out2[0] = *u; - out2[1] = y2[0]; - out2[2] = *v; - out2[3] = y2[1]; - out1 += 4; - out2 += 4; - u++; - v++; - y1 += 2; - y2 += 2; - } - y1 += 2 * src_stride_y - width; - y2 += 2 * src_stride_y - width; - u += src_stride_u - ((width + 1) >> 1); - v += src_stride_v - ((width + 1) >> 1); - out1 += 2 * (dst_stride_frame - width); - out2 += 2 * (dst_stride_frame - width); + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } } -#else - for (; i < (height >> 1);i++) { - int32 width__ = (width >> 4); - _asm - { - ;pusha - mov eax, DWORD PTR [in1] ;1939.33 - mov ecx, DWORD PTR [in2] ;1939.33 - mov ebx, DWORD PTR [src_u] ;1939.33 - mov edx, DWORD PTR [src_v] ;1939.33 -loop0: - movq xmm6, QWORD PTR [ebx] ;src_u - movq xmm0, QWORD PTR [edx] ;src_v - punpcklbw xmm6, xmm0 ;src_u, src_v mix - movdqa xmm1, xmm6 - movdqa xmm2, xmm6 - movdqa xmm4, xmm6 - - movdqu xmm3, XMMWORD PTR [eax] ;in1 - punpcklbw xmm1, xmm3 ;src_u, in1, src_v - mov esi, DWORD PTR [out1] - movdqu XMMWORD PTR [esi], xmm1 ;write to out1 - - movdqu xmm5, XMMWORD PTR [ecx] ;in2 - punpcklbw xmm2, xmm5 ;src_u, in2, src_v - mov edi, DWORD PTR [out2] - movdqu XMMWORD PTR [edi], xmm2 ;write to out2 - - punpckhbw xmm4, xmm3 ;src_u, in1, src_v again - movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again - add esi, 32 - mov DWORD PTR [out1], esi - - punpckhbw xmm6, xmm5 ;src_u, in2, src_v again - movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again - add edi, 32 - mov DWORD PTR [out2], edi - - add ebx, 8 - add edx, 8 - add eax, 
16 - add ecx, 16 - - mov esi, DWORD PTR [width__] - sub esi, 1 - mov DWORD PTR [width__], esi - jg loop0 - - mov DWORD PTR [in1], eax ;1939.33 - mov DWORD PTR [in2], ecx ;1939.33 - mov DWORD PTR [src_u], ebx ;1939.33 - mov DWORD PTR [src_v], edx ;1939.33 - - ;popa - emms - } - in1 += width; - in2 += width; - out1 += 2 * (dst_stride_frame - width); - out2 += 2 * (dst_stride_frame - width); +#elif defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } } #endif + + for (int y = 0; y < height - 1; y += 2) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + CopyRow(src_y, dst_y, width); + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + } return 0; } +// Test if over reading on source is safe. +// TODO(fbarchard): Find more efficient solution to safely do odd sizes. +// Macros to control read policy, from slowest to fastest: +// READSAFE_NEVER - disables read ahead on systems with strict memory reads +// READSAFE_ODDHEIGHT - last row of odd height done with C. +// This policy assumes that the caller handles the last row of an odd height +// image using C. +// READSAFE_PAGE - enable read ahead within same page. +// A page is 4096 bytes. When reading ahead, if the last pixel is near the +// end the page, and a read spans the page into the next page, a memory +// exception can occur if that page has not been allocated, or is a guard +// page. This setting ensures the overread is within the same page. +// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions +// or where buffers are padded by 64 bytes. + +#if defined(HAS_RGB24TOARGBROW_SSSE3) || \ + defined(HAS_RGB24TOARGBROW_SSSE3) || \ + defined(HAS_RAWTOARGBROW_SSSE3) || \ + defined(HAS_RGB565TOARGBROW_SSE2) || \ + defined(HAS_ARGB1555TOARGBROW_SSE2) || \ + defined(HAS_ARGB4444TOARGBROW_SSE2) + +#define READSAFE_ODDHEIGHT + +static bool TestReadSafe(const uint8* src_yuy2, int src_stride_yuy2, + int width, int height, int bpp, int overread) { + if (width > kMaxStride) { + return false; + } +#if defined(READSAFE_ALWAYS) + return true; +#elif defined(READSAFE_NEVER) + return false; +#elif defined(READSAFE_ODDHEIGHT) + if (!(width & 15) || + (src_stride_yuy2 >= 0 && (height & 1) && width * bpp >= overread)) { + return true; + } + return false; +#elif defined(READSAFE_PAGE) + if (src_stride_yuy2 >= 0) { + src_yuy2 += (height - 1) * src_stride_yuy2; + } + uintptr_t last_adr = (uintptr_t)(src_yuy2) + width * bpp - 1; + uintptr_t last_read_adr = last_adr + overread - 1; + if (((last_adr ^ last_read_adr) & ~4095) == 0) { + return true; + } + return false; +#endif +} +#endif -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_y == NULL || src_uv == NULL || dst_frame == NULL) { - return -1; +// Convert YUY2 to I420. 
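YUY2 packs two pixels into four bytes as Y0 U0 Y1 V0, so luma sits at even byte offsets and the shared chroma at odd offsets. A plain C sketch of the per-row split the YUY2ToYRow/YUY2ToUV422Row helpers are assumed to perform (the YUY2ToUVRow variant used in the main loop additionally averages the chroma of two adjacent rows):

#include <stdint.h>

// Split one YUY2 row (width assumed even here) into its Y samples and its
// horizontally subsampled U/V samples.
static void Yuy2RowSplitRef(const uint8_t* yuy2, int width,
                            uint8_t* y, uint8_t* u, uint8_t* v) {
  for (int x = 0; x < width; x += 2) {
    y[0] = yuy2[0];  // Y0
    u[0] = yuy2[1];  // U shared by both pixels
    y[1] = yuy2[2];  // Y1
    v[0] = yuy2[3];  // V shared by both pixels
    yuy2 += 4;
    y += 2;
    u += 1;
    v += 1;
  }
}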
+LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix); + YUY2ToYRow = YUY2ToYRow_C; + YUY2ToUVRow = YUY2ToUVRow_C; +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } } +#elif defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width > 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif - // Bi-Planar: Y plane followed by an interlaced U and V plane - const uint8* interlacedSrc = src_uv; - uint16* out = (uint16*)(src_y) + dst_stride_frame * (height - 1); - uint16* out2 = out - dst_stride_frame; - int32 tmp_r, tmp_g, tmp_b; - const uint8 *y1,*y2; - y1 = src_y; - y2 = y1 + src_stride_y; - int h, w; - - for (h = ((height + 1) >> 1); h > 0; h--) { - // 2 rows at a time, 2 y's at a time - for (w = 0; w < ((width + 1) >> 1); w++) { - // Vertical and horizontal sub-sampling - // 1. Convert to RGB888 - // 2. 
Shift to adequate location (in the 16 bit word) - RGB 565 - - tmp_r = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]] - + mapVcg[interlacedSrc[1]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8); - out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - tmp_r = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]] + 128) >> 8); - tmp_g = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]] - + mapVcg[interlacedSrc[1]] + 128) >> 8); - tmp_b = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8); - out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b ) >> 3); - - tmp_r = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8); - tmp_g = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]] - + mapVcg[interlacedSrc[1]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8); - out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - tmp_r = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]] - + 128) >> 8); - tmp_g = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]] - + mapVcg[interlacedSrc[1]] + 128) >> 8); - tmp_b = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8); - out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) - & 0xfc) << 3) + (Clip(tmp_b) >> 3); - - y1 += 2; - y2 += 2; - out += 2; - out2 += 2; - interlacedSrc += 2; - } - y1 += 2 * src_stride_y - width; - y2 += 2 * src_stride_y - width; - interlacedSrc += src_stride_uv - ((width + 1) >> 1); - out -= 3 * dst_stride_frame + dst_stride_frame - width; - out2 -= 3 * dst_stride_frame + dst_stride_frame - width; + for (int y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); } return 0; } -// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_frame, int dst_stride_frame, - int width, int height) { - if (src_frame == NULL || dst_frame == NULL) { - return -1; +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
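UYVYToI420 mirrors the YUY2 path above; the packing is U0 Y0 V0 Y1 instead of Y0 U0 Y1 V0, so relative to the YUY2 row sketch the luma reads move to the odd byte offsets and the chroma reads to offsets 0 and 2, while the rest of the conversion is identical in structure.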
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; } + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix); + UYVYToYRow = UYVYToYRow_C; + UYVYToUVRow = UYVYToUVRow_C; +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#elif defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width > 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif - int i, j, offset; - uint8* outFrame = dst_frame; - const uint8* inFrame = src_frame; + for (int y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} - outFrame += dst_stride_frame * (height - 1) * 4; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - offset = j * 4; - outFrame[0 + offset] = inFrame[0]; - outFrame[1 + offset] = inFrame[1]; - outFrame[2 + offset] = inFrame[2]; - outFrame[3 + offset] = 0xff; - inFrame += 3; +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif + +#ifdef LIBYUV_LITTLE_ENDIAN +#define READWORD(p) (*reinterpret_cast<const uint32*>(p)) +#else +static inline uint32 READWORD(const uint8* p) { + return static_cast<uint32>(p[0]) | + (static_cast<uint32>(p[1]) << 8) | + (static_cast<uint32>(p[2]) << 16) | + (static_cast<uint32>(p[3]) << 24); +} +#endif + +// Must be multiple of 6 pixels. Will over convert to handle remainder. +// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210 +static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) { + for (int x = 0; x < width; x += 6) { + uint32 w = READWORD(src_v210 + 0); + dst_uyvy[0] = (w >> 2) & 0xff; + dst_uyvy[1] = (w >> 12) & 0xff; + dst_uyvy[2] = (w >> 22) & 0xff; + + w = READWORD(src_v210 + 4); + dst_uyvy[3] = (w >> 2) & 0xff; + dst_uyvy[4] = (w >> 12) & 0xff; + dst_uyvy[5] = (w >> 22) & 0xff; + + w = READWORD(src_v210 + 8); + dst_uyvy[6] = (w >> 2) & 0xff; + dst_uyvy[7] = (w >> 12) & 0xff; + dst_uyvy[8] = (w >> 22) & 0xff; + + w = READWORD(src_v210 + 12); + dst_uyvy[9] = (w >> 2) & 0xff; + dst_uyvy[10] = (w >> 12) & 0xff; + dst_uyvy[11] = (w >> 22) & 0xff; + + src_v210 += 16; + dst_uyvy += 12; + } +} + +// Convert V210 to I420. +// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels. 
+// With is multiple of 48. +LIBYUV_API +int V210ToI420(const uint8* src_v210, int src_stride_v210, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 2 * 2 > kMaxStride) { // 2 rows of UYVY are required. + return -1; + } else if (!src_v210 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_v210 = src_v210 + (height - 1) * src_stride_v210; + src_stride_v210 = -src_stride_v210; + } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*V210ToUYVYRow)(const uint8* src_v210, uint8* dst_uyvy, int pix); + V210ToUYVYRow = V210ToUYVYRow_C; + + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix); + UYVYToYRow = UYVYToYRow_C; + UYVYToUVRow = UYVYToUVRow_C; +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#elif defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width > 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } } - outFrame -= 4 * (dst_stride_frame - width); - inFrame += src_stride_frame - width; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } +#elif defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width > 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + V210ToUYVYRow(src_v210, row, width); + V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width); + UYVYToUVRow(row, kMaxStride, dst_u, dst_v, width); + UYVYToYRow(row, dst_y, width); + UYVYToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_v210 += src_stride_v210 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + V210ToUYVYRow(src_v210, row, width); + UYVYToUVRow(row, 0, dst_u, dst_v, width); + UYVYToYRow(row, dst_y, width); } return 0; } -int ARGBToI420(const uint8* src_frame, int src_stride_frame, +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
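The ARGBToYRow/ARGBToUVRow kernels selected below live in the row_* files; the arithmetic they implement is conventionally BT.601 studio-swing RGB-to-YUV, with the UV row variant also averaging a 2x2 block of pixels before converting. The per-pixel math is roughly the following (coefficients are the commonly used 8-bit fixed-point ones and are an assumption here, not lifted from this commit):

#include <stdint.h>

// Illustrative studio-swing RGB-to-YUV for one pixel. The 0x8080 bias
// folds in the +128 chroma offset so the sums stay non-negative.
static inline uint8_t RgbToY(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
}
static inline uint8_t RgbToU(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
static inline uint8_t RgbToV(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}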
if (height < 0) { height = -height; - src_frame = src_frame + (height - 1) * src_stride_frame; - src_stride_frame = -src_stride_frame; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; #if defined(HAS_ARGBTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = ARGBToYRow_C; - } -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = ARGBToUVRow_C; - } - - for (int y = 0; y < (height - 1); y += 2) { - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); - ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - src_frame += src_stride_frame * 2; + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); } return 0; } -int BGRAToI420(const uint8* src_frame, int src_stride_frame, +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
if (height < 0) { height = -height; - src_frame = src_frame + (height - 1) * src_stride_frame; - src_stride_frame = -src_stride_frame; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; } - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix); + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width); + + BGRAToYRow = BGRAToYRow_C; + BGRAToUVRow = BGRAToUVRow_C; #if defined(HAS_BGRATOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = BGRAToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = BGRAToYRow_C; - } -#if defined(HAS_BGRATOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - ARGBToUVRow = BGRAToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = BGRAToUVRow_C; - } - - for (int y = 0; y < (height - 1); y += 2) { - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); - ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - src_frame += src_stride_frame * 2; + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; + BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); } return 0; } -int ABGRToI420(const uint8* src_frame, int src_stride_frame, +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
if (height < 0) { height = -height; - src_frame = src_frame + (height - 1) * src_stride_frame; - src_stride_frame = -src_stride_frame; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix); + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width); + + ABGRToYRow = ABGRToYRow_C; + ABGRToUVRow = ABGRToUVRow_C; #if defined(HAS_ABGRTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = ABGRToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = ABGRToYRow_C; - } -#if defined(HAS_ABGRTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - ARGBToUVRow = ABGRToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = ABGRToUVRow_C; - } - - for (int y = 0; y < (height - 1); y += 2) { - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); - ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - src_frame += src_stride_frame * 2; + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3; + ABGRToYRow = ABGRToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); } return 0; } -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, +LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
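[Editor's note] The body that follows each dispatch block is the same 4:2:0 loop: each iteration consumes two source rows and produces two Y rows but only one U and one V row. A sketch of its shape, with ToYRow/ToUVRow as placeholders for the per-format row functions:

for (int y = 0; y < height - 1; y += 2) {
  ToUVRow(src, src_stride, dst_u, dst_v, width);  // samples rows y and y+1
  ToYRow(src, dst_y, width);
  ToYRow(src + src_stride, dst_y + dst_stride_y, width);
  src += src_stride * 2;
  dst_y += dst_stride_y * 2;
  dst_u += dst_stride_u;
  dst_v += dst_stride_v;
}
if (height & 1) {
  ToUVRow(src, 0, dst_u, dst_v, width);  // stride 0: last row pairs with itself
  ToYRow(src, dst_y, width);
}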
+ if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix); + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); + + RGBAToYRow = RGBAToYRow_C; + RGBAToUVRow = RGBAToUVRow_C; +#if defined(HAS_RGBATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3; + RGBAToYRow = RGBAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + RGBAToYRow = RGBAToYRow_SSSE3; + } + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (width * 4 > kMaxStride) { // Row buffer is required. + return -1; + } else if (!src_rgb24 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. if (height < 0) { height = -height; - src_frame = src_frame + (height - 1) * src_stride_frame; - src_stride_frame = -src_stride_frame; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix); + + RGB24ToARGBRow = RGB24ToARGBRow_C; +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } +#endif + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#if defined(HAS_RGB24TOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = RGB24ToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = RGB24ToYRow_C; - } -#if defined(HAS_RGB24TOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - ARGBToUVRow = RGB24ToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = RGB24ToUVRow_C; - } - - for (int y = 0; y < (height - 1); y += 2) { - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); - ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - src_frame += src_stride_frame * 2; + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; +#if 
defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + } + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); + RGB24ToARGBRow_C(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); } return 0; } -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 4 > kMaxStride) { // Row buffer is required. + return -1; + } else if (!src_raw || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. if (height < 0) { height = -height; - src_frame = src_frame + (height - 1) * src_stride_frame; - src_stride_frame = -src_stride_frame; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix); + + RAWToARGBRow = RAWToARGBRow_C; +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } +#endif + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#if defined(HAS_RAWTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = RAWToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = RAWToYRow_C; - } -#if defined(HAS_RAWTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - ARGBToUVRow = RAWToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = RAWToUVRow_C; - } - - for (int y = 0; y < (height - 1); y += 2) { - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); - ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - src_frame += src_stride_frame * 2; + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + } + ARGBToYRow = 
ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); - ARGBToYRow(src_frame, dst_y, width); + RAWToARGBRow_C(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); } return 0; } -} // namespace libyuv +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 4 > kMaxStride) { // Row buffer is required. + return -1; + } else if (!src_rgb565 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix); + + RGB565ToARGBRow = RGB565ToARGBRow_C; +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } +#endif + + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + } + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGB565ToARGBRow_C(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + return 0; +} + +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 4 > kMaxStride) { // Row buffer is required. + return -1; + } else if (!src_argb1555 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
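[Editor's note] RGB24ToI420, RAWToI420 and RGB565ToI420 above stage each pair of source rows through a SIMD_ALIGNED ARGB row buffer and then reuse the ARGB row functions, which is why they reject frames with width * 4 > kMaxStride up front. A caller-side sketch (buffer names hypothetical):

// RGB565ToI420 returns -1 both for null/invalid arguments and for frames
// too wide for the internal row buffer; a caller cannot tell the two apart.
int r = RGB565ToI420(rgb565, width * 2,
                     y_plane, width,
                     u_plane, (width + 1) / 2,
                     v_plane, (width + 1) / 2,
                     width, height);
if (r != 0) {
  // Fall back, e.g. convert in vertical strips narrower than kMaxStride / 4.
}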
+ if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix); + + ARGB1555ToARGBRow = ARGB1555ToARGBRow_C; +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + TestReadSafe(src_argb1555, src_stride_argb1555, width, height, 2, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } +#endif + + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + } + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, + row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGB1555ToARGBRow_C(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + return 0; +} + +LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 4 > kMaxStride) { // Row buffer is required. + return -1; + } else if (!src_argb4444 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
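[Editor's note] ARGB1555ToI420 above unpacks 16-bit pixels to 32-bit ARGB before the Y/UV passes. For reference, a scalar sketch of the usual 1555 expansion by bit replication; this is an illustration of the technique, not the patch's row code, and the real row functions may round differently:

static inline uint32 Argb1555ToArgb8888(uint16 p) {
  uint32 b = (p >> 0) & 0x1f;   // 5-bit blue
  uint32 g = (p >> 5) & 0x1f;   // 5-bit green
  uint32 r = (p >> 10) & 0x1f;  // 5-bit red
  uint32 a = (p >> 15) & 0x1;   // 1-bit alpha
  b = (b << 3) | (b >> 2);      // replicate high bits to fill 8 bits
  g = (g << 3) | (g >> 2);
  r = (r << 3) | (r >> 2);
  return ((a ? 255u : 0u) << 24) | (r << 16) | (g << 8) | b;
}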
+ if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix); + + ARGB4444ToARGBRow = ARGB4444ToARGBRow_C; +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + TestReadSafe(src_argb4444, src_stride_argb4444, width, height, 2, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } +#endif + + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + + ARGBToYRow = ARGBToYRow_C; + ARGBToUVRow = ARGBToUVRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + } + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, + row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGB4444ToARGBRow_C(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + return 0; +} + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = static_cast<I420Buffers*>(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = static_cast<I420Buffers*>(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = static_cast<I420Buffers*>(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest 
= static_cast<I420Buffers*>(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = static_cast<I420Buffers*>(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port to C + MJpegDecoder mjpeg_decoder; + bool ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + 
} else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return 0; +} +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToI420(const uint8* sample, +#ifdef HAVE_JPEG + size_t sample_size, +#else + size_t /* sample_size */, +#endif + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int dst_width, int dst_height, + RotationMode rotation, + uint32 format) { + if (!y || !u || !v || !sample || + src_width <= 0 || dst_width <= 0 || + src_height == 0 || dst_height == 0) { + return -1; + } + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height; + if (src_height < 0) { + inv_dst_height = -inv_dst_height; + } + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination y is same as source sample, + // also enable temporary buffer. + bool need_buf = (rotation && format != FOURCC_I420 && + format != FOURCC_NV12 && format != FOURCC_NV21 && + format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample; + uint8* tmp_y = y; + uint8* tmp_u = u; + uint8* tmp_v = v; + int tmp_y_stride = y_stride; + int tmp_u_stride = u_stride; + int tmp_v_stride = v_stride; + uint8* buf = NULL; + int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; + if (need_buf) { + int y_size = dst_width * abs_dst_height; + int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2); + buf = new uint8[y_size + uv_size * 2]; + if (!buf) { + return 1; // Out of memory runtime error. + } + y = buf; + u = y + y_size; + v = u + uv_size; + y_stride = dst_width; + u_stride = v_stride = ((dst_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_V210: + // stride is multiple of 48 pixels (128 bytes). 
+ // pixels come in groups of 6 = 16 bytes + src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y + + crop_x / 6 * 16; + r = V210ToI420(src, (aligned_src_width + 47) / 48 * 128, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. 
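[Editor's note] ConvertToI420 above centralizes cropping, vertical flip and rotation over all supported FOURCCs. An example call (buffers hypothetical; kRotate0 is the RotationMode from rotate.h), cropping a 672x504 YUY2 capture to its central 640x480:

int r = ConvertToI420(frame, frame_size,
                      y, 640,
                      u, 320,
                      v, 320,
                      /*crop_x=*/16, /*crop_y=*/12,
                      /*src_width=*/672, /*src_height=*/504,
                      /*dst_width=*/640, /*dst_height=*/480,
                      kRotate0, FOURCC_YUY2);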
+ case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + dst_width, inv_dst_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + case FOURCC_Q420: + src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; + src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + + src_width + crop_x * 2; + r = Q420ToI420(src, src_width * 3, + src_uv, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + 
src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToI420(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, + src_u, src_width, + src_v, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToI420(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + y, y_stride, + u, u_stride, + v, v_stride, + dst_width, inv_dst_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, + y, y_stride, + u, u_stride, + v, v_stride, + src_width, abs_src_height, dst_width, inv_dst_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. + } + + if (need_buf) { + if (!r) { + r = I420Rotate(y, y_stride, + u, u_stride, + v, v_stride, + tmp_y, tmp_y_stride, + tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, + dst_width, abs_dst_height, rotation); + } + delete buf; + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc new file mode 100644 index 00000000..1c5aa9d9 --- /dev/null +++ b/files/source/convert_argb.cc @@ -0,0 +1,1300 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include <string.h> // for memset() + +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. 
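[Editor's note] ARGBCopy at the top of convert_argb.cc reduces to a CopyPlane of width * 4 bytes per row and supports the same negative-height flip as the converters. A one-line illustration (hypothetical buffers):

ARGBCopy(src_argb, width * 4, dst_argb, width * 4, width, -height);  // flipped copy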
+LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
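[Editor's note] I444ToARGB and I422ToARGB above differ only in chroma subsampling, and both implement the negative-height flip by walking dst_argb backwards rather than the source planes. An example for the 422 case (hypothetical planes):

// I422: one U and one V byte per 2x1 luma block, so chroma rows are half
// width, but there are as many chroma rows as luma rows.
I422ToARGB(y_plane, width,
           u_plane, (width + 1) / 2,
           v_plane, (width + 1) / 2,
           argb, width * 4,
           width, height);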
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*YToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = YToARGBRow_C; +#if defined(HAS_YTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YToARGBRow = YToARGBRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + YToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + I400ToARGBRow_C; +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_bgra || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
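[Editor's note] The two I400 paths above handle flipping differently: I400ToARGB_Reference inverts the destination stride, while I400ToARGB inverts the source. An illustration of the direct path, expanding a grey plane to 32-bit ARGB (buffers hypothetical):

I400ToARGB(gray_plane, width, argb, width * 4, width, height);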
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) = + BGRAToARGBRow_C; +#if defined(HAS_BGRATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + BGRAToARGBRow = BGRAToARGBRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + BGRAToARGBRow(src_bgra, dst_argb, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ABGR to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_abgr || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) = + ABGRToARGBRow_C; +#if defined(HAS_ABGRTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ABGRToARGBRow = ABGRToARGBRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + ABGRToARGBRow(src_abgr, dst_argb, width); + src_abgr += src_stride_abgr; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGBA to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_rgba || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } + void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) = + RGBAToARGBRow_C; +#if defined(HAS_RGBATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGBAToARGBRow = RGBAToARGBRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + RGBAToARGBRow(src_rgba, dst_argb, width); + src_rgba += src_stride_rgba; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_raw || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) = + RAWToARGBRow_C; +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB24 to ARGB. 
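[Editor's note] BGRAToARGB, ABGRToARGB, RGBAToARGB and RAWToARGB above are per-row channel shuffles: dimensions are unchanged and only a single aligned SSSE3 fast path exists per format. Illustrative calls (hypothetical buffers; note RAW rows are 3 bytes per pixel):

ABGRToARGB(abgr, width * 4, argb, width * 4, width, height);
RAWToARGB(raw, width * 3, argb, width * 4, width, height);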
+LIBYUV_API +int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_rgb24 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
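[Editor's note] Unlike the staged I420 converters earlier in the patch, RGB565ToARGB and ARGB1555ToARGB take their SSE2 path only when width is a multiple of 8 and the destination is 16-byte aligned; there is no _Any variant here, so every other shape runs the C row for each line. Illustration:

RGB565ToARGB(rgb565, width * 2, argb, width * 4, width, height);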
+ if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (int y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (int y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. 
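[Editor's note] NV12ToARGB and NV21ToARGB above read one interleaved chroma plane and advance src_uv only after odd rows (the y & 1 test), i.e. once per two luma rows. An example call (hypothetical buffers; even width assumed, so the interleaved UV stride equals width):

NV12ToARGB(y_plane, width,
           uv_plane, width,   // width/2 interleaved U,V pairs = width bytes
           argb, width * 4,
           width, height);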
+LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } +#elif defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + } + if (IS_ALIGNED(width, 8)) { + YUY2ToYRow = YUY2ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } + } +#endif + + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif + + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + + for (int y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, rowu, rowv, width); + YUY2ToYRow(src_yuy2, rowy, 
width); + I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } +#endif + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif + + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + + for (int y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, rowu, rowv, width); + UYVYToYRow(src_uyvy, rowy, width); + I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque); + 
I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port to C + MJpegDecoder mjpeg_decoder; + bool ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // 
factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return 0; +} +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int argb_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int dst_width, int dst_height, + RotationMode rotation, + uint32 format) { + if (dst_argb == NULL || sample == NULL || + src_width <= 0 || dst_width <= 0 || + src_height == 0 || dst_height == 0) { + return -1; + } + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height; + if (src_height < 0) { + inv_dst_height = -inv_dst_height; + } + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, + // also enable temporary buffer. + bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8* tmp_argb = dst_argb; + int tmp_argb_stride = argb_stride; + uint8* buf = NULL; + int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; + if (need_buf) { + int argb_size = dst_width * abs_dst_height * 4; + buf = new uint8[argb_size]; + if (!buf) { + return 1; // Out of memory runtime error. + } + dst_argb = buf; + argb_stride = dst_width; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; +// case FOURCC_V210: + // stride is multiple of 48 pixels (128 bytes). 
+ // pixels come in groups of 6 = 16 bytes +// src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y + +// crop_x / 6 * 16; +// r = V210ToARGB(src, (aligned_src_width + 47) / 48 * 128, +// dst_argb, argb_stride, +// dst_width, inv_dst_height); +// break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. + case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, + src_uv, aligned_src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
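+ // (NV21 is NV12 with the interleaved chroma bytes stored in VU rather
+ // than UV order, so the same biplanar path handles both.)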
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; +// case FOURCC_Q420: +// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; +// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + +// src_width + crop_x * 2; +// r = Q420ToARGB(src, src_width * 3, +// src_uv, src_width * 3, +// dst_argb, argb_stride, +// dst_width, inv_dst_height); +// break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + dst_argb, argb_stride, + dst_width, inv_dst_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, + dst_argb, argb_stride, + src_width, abs_src_height, dst_width, inv_dst_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
+ } + + if (need_buf) { + if (!r) { + r = ARGBRotate(dst_argb, argb_stride, + tmp_argb, tmp_argb_stride, + dst_width, abs_dst_height, rotation); + } + delete buf; + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc new file mode 100644 index 00000000..4ea974ac --- /dev/null +++ b/files/source/convert_from.cc @@ -0,0 +1,1425 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + int halfwidth = (width + 1) >> 1; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) { + CopyRow = CopyRow_NEON; + } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(halfwidth, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) && + IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && + IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif + + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + // UpSample U plane. + int y; + for (y = 0; y < height - 1; y += 2) { + CopyRow(src_u, dst_u, halfwidth); + CopyRow(src_u, dst_u + dst_stride_u, halfwidth); + src_u += src_stride_u; + dst_u += dst_stride_u * 2; + } + if (height & 1) { + CopyRow(src_u, dst_u, halfwidth); + } + + // UpSample V plane. 
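+ // As with the U plane above, each source chroma row is written to two
+ // destination rows to double the vertical chroma resolution.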
+ for (y = 0; y < height - 1; y += 2) { + CopyRow(src_v, dst_v, halfwidth); + CopyRow(src_v, dst_v + dst_stride_v, halfwidth); + src_v += src_stride_v; + dst_v += dst_stride_v * 2; + } + if (height & 1) { + CopyRow(src_v, dst_v, halfwidth); + } + return 0; +} + +// use Bilinear for upsampling chroma +void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u|| !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + // Upsample U plane. + ScalePlaneBilinear(halfwidth, halfheight, + width, height, + src_stride_u, + dst_stride_u, + src_u, dst_u); + + // Upsample V plane. + ScalePlaneBilinear(halfwidth, halfheight, + width, height, + src_stride_v, + dst_stride_v, + src_v, dst_v); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (height - 1) * dst_stride_u; + dst_v = dst_v + (height - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + + // Copy Y plane + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + int quarterwidth = (width + 3) >> 2; + + // Resample U plane. + ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height + quarterwidth, height, // to 1/4 width, 1x height + src_stride_u, + dst_stride_u, + src_u, dst_u); + + // Resample V plane. + ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height + quarterwidth, height, // to 1/4 width, 1x height + src_stride_v, + dst_stride_v, + src_v, dst_v); + return 0; +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
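+ // The flip is done by starting at the last source row and negating the
+ // stride, the convention used throughout libyuv.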
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_I42XTOYUY2ROW_SSE2 +__declspec(naked) __declspec(align(16)) +static void I42xToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 16 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqa xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#define HAS_I42XTOUYVYROW_SSE2 +__declspec(naked) __declspec(align(16)) +static void I42xToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 16 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqa xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqa [edi], xmm1 + movdqa [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_I42XTOYUY2ROW_SSE2 +static void I42xToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq (%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +#define HAS_I42XTOUYVYROW_SSE2 +static void I42xToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq (%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,(%3) \n" + "movdqa %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + 
"+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} +#endif + +static void I42xToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[0]; // duplicate last y + dst_frame[3] = src_v[0]; + } +} + +static void I42xToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[0]; // duplicate last y + } +} + +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif + +#ifdef LIBYUV_LITTLE_ENDIAN +#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v +#else +static inline void WRITEWORD(uint8* p, uint32 v) { + p[0] = (uint8)(v & 255); + p[1] = (uint8)((v >> 8) & 255); + p[2] = (uint8)((v >> 16) & 255); + p[3] = (uint8)((v >> 24) & 255); +} +#endif + +#define EIGHTTOTEN(x) (x << 2 | x >> 6) +static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) { + for (int x = 0; x < width; x += 6) { + WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) | + (EIGHTTOTEN(src_uyvy[1]) << 10) | + (EIGHTTOTEN(src_uyvy[2]) << 20)); + WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) | + (EIGHTTOTEN(src_uyvy[4]) << 10) | + (EIGHTTOTEN(src_uyvy[5]) << 20)); + WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) | + (EIGHTTOTEN(src_uyvy[7]) << 10) | + (EIGHTTOTEN(src_uyvy[8]) << 20)); + WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) | + (EIGHTTOTEN(src_uyvy[10]) << 10) | + (EIGHTTOTEN(src_uyvy[11]) << 20)); + src_uyvy += 12; + dst_v210 += 16; + } +} + +// TODO(fbarchard): Deprecate, move or expand 422 support? +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_frame || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_frame = dst_frame + (height - 1) * dst_stride_frame; + dst_stride_frame = -dst_stride_frame; + } + void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_frame, int width) = + I42xToYUY2Row_C; +#if defined(HAS_I42XTOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + I42xToYUY2Row = I42xToYUY2Row_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + I42xToYUY2Row(src_y, src_u, src_y, dst_frame, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_frame += dst_stride_frame; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_frame || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_frame = dst_frame + (height - 1) * dst_stride_frame; + dst_stride_frame = -dst_stride_frame; + } + void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_frame, int width) = + I42xToYUY2Row_C; +#if defined(HAS_I42XTOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + I42xToYUY2Row = I42xToYUY2Row_SSE2; + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width); + I42xToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_frame + dst_stride_frame, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_frame += dst_stride_frame * 2; + } + if (height & 1) { + I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width); + } + return 0; +} + +// TODO(fbarchard): Deprecate, move or expand 422 support? +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_frame || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_frame = dst_frame + (height - 1) * dst_stride_frame; + dst_stride_frame = -dst_stride_frame; + } + void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_frame, int width) = + I42xToUYVYRow_C; +#if defined(HAS_I42XTOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + I42xToUYVYRow = I42xToUYVYRow_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + I42xToUYVYRow(src_y, src_u, src_y, dst_frame, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_frame += dst_stride_frame; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_frame || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_frame = dst_frame + (height - 1) * dst_stride_frame; + dst_stride_frame = -dst_stride_frame; + } + void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_frame, int width) = + I42xToUYVYRow_C; +#if defined(HAS_I42XTOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + I42xToUYVYRow = I42xToUYVYRow_SSE2; + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width); + I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, + dst_frame + dst_stride_frame, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_frame += dst_stride_frame * 2; + } + if (height & 1) { + I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width); + } + return 0; +} + +LIBYUV_API +int I420ToV210(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (width * 16 / 6 > kMaxStride) { // Row buffer of V210 is required. + return -1; + } else if (!src_y || !src_u || !src_v || !dst_frame || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_frame = dst_frame + (height - 1) * dst_stride_frame; + dst_stride_frame = -dst_stride_frame; + } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix); + UYVYToV210Row = UYVYToV210Row_C; + + void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_frame, int width) = + I42xToUYVYRow_C; +#if defined(HAS_I42XTOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) { + I42xToUYVYRow = I42xToUYVYRow_SSE2; + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + I42xToUYVYRow(src_y, src_u, src_v, row, width); + UYVYToV210Row(row, dst_frame, width); + I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width); + UYVYToV210Row(row, dst_frame + dst_stride_frame, width); + + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_frame += dst_stride_frame * 2; + } + if (height & 1) { + I42xToUYVYRow(src_y, src_u, src_v, row, width); + UYVYToV210Row(row, dst_frame, width); + } + return 0; +} + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#elif defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#elif defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#elif defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; +#if defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#elif defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_rgb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } +#endif + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width * 2 <= kMaxStride) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row, width); + ARGBToRGB565Row(row, dst_rgb, width); + dst_rgb += dst_stride_rgb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } +#endif + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width * 2 <= kMaxStride) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row, width); + ARGBToARGB1555Row(row, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } +#endif + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width * 2 <= kMaxStride) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row, width); + ARGBToARGB4444Row(row, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format) { + if (!y || !u|| !v || !dst_sample || + width <= 0 || height == 0) { + return -1; + } + int r = 0; + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_V210: + r = I420ToV210(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : + (width + 47) / 48 * 128, + width, height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGGR: + r = I420ToBayerBGGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GBRG: + r = I420ToBayerGBRG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GRBG: + r = I420ToBayerGRBG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_RGGB: + r = I420ToBayerRGGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? 
dst_sample_stride : width, + width, height); + break; + // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index cc44e215..2e96d9b9 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,66 +9,206 @@ */ #include "libyuv/cpu_id.h" -#include "libyuv/basic_types.h" // for CPU_X86 #ifdef _MSC_VER -#include <intrin.h> +#include <intrin.h> // For __cpuid() #endif +#if !defined(__CLR_VER) && defined(_M_X64) && \ + defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#include <immintrin.h> // For _xgetbv() +#endif + +#include <stdlib.h> // For getenv() + +// For ArmCpuCaps() but unittested on all platforms +#include <stdio.h> +#include <string.h> + +#include "libyuv/basic_types.h" // For CPU_X86 // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. 
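// With PIC on 32-bit x86, ebx holds the GOT pointer, so the variant below
// preserves it around the cpuid instruction.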
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static inline void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" +static __inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( // NOLINT + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); + : "a"(info_type)); } #elif defined(__i386__) || defined(__x86_64__) -static inline void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "cpuid\n" +static __inline void __cpuid(int cpu_info[4], int info_type) { + asm volatile ( // NOLINT + "cpuid \n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); + : "a"(info_type)); } #endif +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif + +// Low level cpuid for X86. Returns zeros on other CPUs. +#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) +LIBYUV_API +void CpuId(int cpu_info[4], int info_type) { + __cpuid(cpu_info, info_type); +} +#else +LIBYUV_API +void CpuId(int cpu_info[4], int) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +#if !defined(__CLR_VER) && defined(_M_X64) && \ + defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define HAS_XGETBV +static uint32 XGetBV(unsigned int xcr) { + return static_cast<uint32>(_xgetbv(xcr)); +} +#elif !defined(__CLR_VER) && defined(_M_IX86) +#define HAS_XGETBV +__declspec(naked) __declspec(align(16)) +static uint32 XGetBV(unsigned int xcr) { + __asm { + mov ecx, [esp + 4] // xcr + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005. + ret + } +} +#elif defined(__i386__) || defined(__x86_64__) +#define HAS_XGETBV +static uint32 XGetBV(unsigned int xcr) { + uint32 xcr_feature_mask; + asm volatile ( // NOLINT + ".byte 0x0f, 0x01, 0xd0\n" + : "=a"(xcr_feature_mask) + : "c"(xcr) + : "memory", "cc", "edx"); // edx unused. + return xcr_feature_mask; +} +#endif +#ifdef HAS_XGETBV +static const int kXCR_XFEATURE_ENABLED_MASK = 0; +#endif + +// based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name) { + int flags = 0; + FILE* fin = fopen(cpuinfo_name, "r"); + if (fin) { + char buf[512]; + while (fgets(buf, 511, fin)) { + if (memcmp(buf, "Features", 8) == 0) { + flags |= kCpuInitialized; + char* p = strstr(buf, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + flags |= kCpuHasNEON; + break; + } + } + } + fclose(fin); + } + return flags; +} // CPU detect function for SIMD instruction sets. -static int cpu_info_ = 0; +LIBYUV_API +int cpu_info_ = 0; -// TODO(fbarchard): (cpu_info[2] & 0x10000000 ? kCpuHasAVX : 0) -static void InitCpuFlags() { -#ifdef CPU_X86 +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +static bool TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return true; + } + } + return false; +} + +LIBYUV_API +int InitCpuFlags(void) { +#if !defined(__CLR_VER) && defined(CPU_X86) int cpu_info[4]; __cpuid(cpu_info, 1); - cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | - (cpu_info[2] & 0x00000200 ? 
kCpuHasSSSE3 : 0) | - kCpuInitialized; + cpu_info_ = ((cpu_info[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + (((cpu_info[2] & 0x18000000) == 0x18000000) ? kCpuHasAVX : 0) | + kCpuInitialized | kCpuHasX86; +#ifdef HAS_XGETBV + if (cpu_info_ & kCpuHasAVX) { + __cpuid(cpu_info, 7); + if ((cpu_info[1] & 0x00000020) && + ((XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06)) { + cpu_info_ |= kCpuHasAVX2; + } + } +#endif + // environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info_ &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info_ &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info_ &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info_ &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info_ &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info_ &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info_ &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = kCpuInitialized; + } +#elif defined(__arm__) +#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + // linux arm parse text file for neon detect. + cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); #elif defined(__ARM_NEON__) // gcc -mfpu=neon defines __ARM_NEON__ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // to disable Neon on devices that do not have it. - cpu_info_ = kCpuHasNEON | kCpuInitialized; -#else - cpu_info_ = kCpuInitialized; + cpu_info_ = kCpuHasNEON; #endif + cpu_info_ |= kCpuInitialized | kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info_ &= ~kCpuHasNEON; + } + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = kCpuInitialized; + } +#endif // __arm__ + return cpu_info_; } +LIBYUV_API void MaskCpuFlags(int enable_flags) { InitCpuFlags(); - cpu_info_ &= enable_flags; -} - -bool TestCpuFlag(int flag) { - if (0 == cpu_info_) { - InitCpuFlags(); - } - return cpu_info_ & flag ? true : false; + cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; } +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc index 958f44c4..ed12de88 100644 --- a/files/source/format_conversion.cc +++ b/files/source/format_conversion.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,66 +8,73 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> +#include "libyuv/format_conversion.h" +#include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" -#include "video_common.h" -#include "row.h" - -#define kMaxStride (2048 * 4) +#include "libyuv/video_common.h" +#include "libyuv/row.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers -// and vst would select which 2 components to write. The low level would need +// and vst would select which 2 components to write. 
The low level would need // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR -#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBTOBAYERROW_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_bayer - movd xmm7, [esp + 12] // selector + movd xmm5, [esp + 12] // selector mov ecx, [esp + 16] // pix - pshufd xmm7, xmm7, 0 + pshufd xmm5, xmm5, 0 + align 16 wloop: movdqa xmm0, [eax] lea eax, [eax + 16] - pshufb xmm0, xmm7 + pshufb xmm0, xmm5 + sub ecx, 4 movd [edx], xmm0 lea edx, [edx + 4] - sub ecx, 4 - ja wloop + jg wloop ret } } -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ARGBTOBAYERROW_SSSE3 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { - asm volatile( - "movd %3,%%xmm7\n" - "pshufd $0x0,%%xmm7,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + asm volatile ( + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 - : "r"(selector) // %3 - : "memory" + : "g"(selector) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); } #endif @@ -77,7 +84,7 @@ static void ARGBToBayerRow_C(const uint8* src_argb, int index0 = selector & 0xff; int index1 = (selector >> 8) & 0xff; // Copy a row of Bayer. - for (int x = 0; x < (pix - 1); x += 2) { + for (int x = 0; x < pix - 1; x += 2) { dst_bayer[0] = src_argb[index0]; dst_bayer[1] = src_argb[index1]; src_argb += 8; @@ -96,243 +103,258 @@ static uint32 GenerateSelector(int select0, int select1) { static_cast<uint32>((select1 + 12) << 24); } -// Converts 32 bit ARGB to any Bayer RGB format. -int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, - uint8* dst_bayer, int dst_stride_bayer, - uint32 dst_fourcc_bayer, - int width, int height) { - if (height < 0) { - height = -height; - src_rgb = src_rgb + (height - 1) * src_stride_rgb; - src_stride_rgb = -src_stride_rgb; - } - void (*ARGBToBayerRow)(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 4 == 0) && - IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) && - IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; - } else -#endif - { - ARGBToBayerRow = ARGBToBayerRow_C; - } - - int blue_index = 0; - int green_index = 1; - int red_index = 2; - +static int MakeSelectors(const int blue_index, + const int green_index, + const int red_index, + uint32 dst_fourcc_bayer, + uint32 *index_map) { // Now build a lookup table containing the indices for the four pixels in each // 2x2 Bayer grid. 
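For orientation, the selectors built here are packed ARGB byte offsets: the C fallback above reads the low byte as the source offset of the first Bayer sample in each pixel pair and the next byte as the offset of the second, while the SSSE3 path feeds the same value to pshufb to cover four pixels per iteration. A minimal sketch of the even-row BGGR lookup, assuming ARGB byte order B = 0, G = 1, R = 2 and showing only the two bytes the C path consumes (the bias of the second offset into the following pixel is inferred from the selector generator):

    #include "libyuv/basic_types.h"

    // Hypothetical check of the indexing done by ARGBToBayerRow_C:
    // low byte selects from the first ARGB pixel, next byte from the second.
    static void SelectorExample() {
      const uint32 kBggrEvenRow = 0x00000500u;  // B of pixel 0, G of pixel 1
      const uint8 argb[8] = {255, 0, 0, 255,  0, 255, 0, 255};  // blue px, green px
      uint8 bayer[2];
      bayer[0] = argb[kBggrEvenRow & 0xff];               // 255: the blue sample
      bayer[1] = argb[(kBggrEvenRow >> 8) & 0xff];        // 255: the green sample
    }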
- uint32 index_map[2]; switch (dst_fourcc_bayer) { - default: - assert(false); - case FOURCC_RGGB: - index_map[0] = GenerateSelector(red_index, green_index); - index_map[1] = GenerateSelector(green_index, blue_index); - break; case FOURCC_BGGR: index_map[0] = GenerateSelector(blue_index, green_index); index_map[1] = GenerateSelector(green_index, red_index); break; - case FOURCC_GRBG: - index_map[0] = GenerateSelector(green_index, red_index); - index_map[1] = GenerateSelector(blue_index, green_index); - break; case FOURCC_GBRG: index_map[0] = GenerateSelector(green_index, blue_index); index_map[1] = GenerateSelector(red_index, green_index); break; + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + default: + return -1; // Bad FourCC + } + return 0; +} + +// Converts 32 bit ARGB to Bayer RGB formats. +LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } +#endif + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC } - // Now convert. 
for (int y = 0; y < height; ++y) { - ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width); - src_rgb += src_stride_rgb; + ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width); + src_argb += src_stride_argb; dst_bayer += dst_stride_bayer; } return 0; } -#define AVG(a,b) (((a) + (b)) >> 1) +#define AVG(a, b) (((a) + (b)) >> 1) static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_rgb, int pix) { + uint8* dst_argb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 g = src_bayer0[1]; uint8 r = src_bayer1[1]; - for (int x = 0; x < (pix - 2); x += 2) { - dst_rgb[0] = src_bayer0[0]; - dst_rgb[1] = AVG(g, src_bayer0[1]); - dst_rgb[2] = AVG(r, src_bayer1[1]); - dst_rgb[3] = 255U; - dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]); - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = src_bayer1[1]; - dst_rgb[7] = 255U; + for (int x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; g = src_bayer0[1]; r = src_bayer1[1]; src_bayer0 += 2; src_bayer1 += 2; - dst_rgb += 8; - } - dst_rgb[0] = src_bayer0[0]; - dst_rgb[1] = AVG(g, src_bayer0[1]); - dst_rgb[2] = AVG(r, src_bayer1[1]); - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer0[0]; - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = src_bayer1[1]; - dst_rgb[7] = 255U; + dst_argb += 8; + } + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[0]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + } } static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_rgb, int pix) { + uint8* dst_argb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 g = src_bayer0[1]; uint8 b = src_bayer1[1]; - for (int x = 0; x < (pix - 2); x += 2) { - dst_rgb[0] = AVG(b, src_bayer1[1]); - dst_rgb[1] = AVG(g, src_bayer0[1]); - dst_rgb[2] = src_bayer0[0]; - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer1[1]; - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]); - dst_rgb[7] = 255U; + for (int x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[7] = 255U; g = src_bayer0[1]; b = src_bayer1[1]; src_bayer0 += 2; src_bayer1 += 2; - dst_rgb += 8; - } - dst_rgb[0] = AVG(b, src_bayer1[1]); - dst_rgb[1] = AVG(g, src_bayer0[1]); - dst_rgb[2] = src_bayer0[0]; - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer1[1]; - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = src_bayer0[0]; - dst_rgb[7] = 255U; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer0[0]; + dst_argb[7] = 255U; + } } static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_rgb, int pix) { + uint8* dst_argb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 b = src_bayer0[1]; - for (int x = 0; x < (pix - 2); x += 2) { - dst_rgb[0] = AVG(b, 
src_bayer0[1]); - dst_rgb[1] = src_bayer0[0]; - dst_rgb[2] = src_bayer1[0]; - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer0[1]; - dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]); - dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]); - dst_rgb[7] = 255U; + for (int x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + dst_argb[3] = 255U; + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[7] = 255U; b = src_bayer0[1]; src_bayer0 += 2; src_bayer1 += 2; - dst_rgb += 8; - } - dst_rgb[0] = AVG(b, src_bayer0[1]); - dst_rgb[1] = src_bayer0[0]; - dst_rgb[2] = src_bayer1[0]; - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer0[1]; - dst_rgb[5] = src_bayer0[0]; - dst_rgb[6] = src_bayer1[0]; - dst_rgb[7] = 255U; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer1[0]; + dst_argb[7] = 255U; + } } static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_rgb, int pix) { + uint8* dst_argb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 r = src_bayer0[1]; - for (int x = 0; x < (pix - 2); x += 2) { - dst_rgb[0] = src_bayer1[0]; - dst_rgb[1] = src_bayer0[0]; - dst_rgb[2] = AVG(r, src_bayer0[1]); - dst_rgb[3] = 255U; - dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]); - dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]); - dst_rgb[6] = src_bayer0[1]; - dst_rgb[7] = 255U; + for (int x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; r = src_bayer0[1]; src_bayer0 += 2; src_bayer1 += 2; - dst_rgb += 8; - } - dst_rgb[0] = src_bayer1[0]; - dst_rgb[1] = src_bayer0[0]; - dst_rgb[2] = AVG(r, src_bayer0[1]); - dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer1[0]; - dst_rgb[5] = src_bayer0[0]; - dst_rgb[6] = src_bayer0[1]; - dst_rgb[7] = 255U; + dst_argb += 8; + } + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[0]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + } } // Converts any Bayer RGB format to ARGB. 
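The BayerRow* helpers above reconstruct the missing channels with the two-tap AVG mean and are driven, one row pair at a time, by the BayerToARGB entry point that follows; the forward direction, ARGBToBayer, appeared earlier. A minimal sketch of that forward call with the reworked signature (the wrapper and buffer names are hypothetical):

    #include "libyuv/format_conversion.h"
    #include "libyuv/video_common.h"  // FOURCC_BGGR

    // Convert a caller-owned width x height ARGB image to BGGR Bayer.
    // Returns 0 on success, -1 for an unsupported FourCC.
    int ToBggr(const uint8* argb, uint8* bayer, int width, int height) {
      return libyuv::ARGBToBayer(argb, width * 4,   // ARGB stride in bytes
                                 bayer, width,      // one byte per Bayer sample
                                 width, height, FOURCC_BGGR);
    }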
-int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, - uint32 src_fourcc_bayer, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height) { +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer) { if (height < 0) { height = -height; - dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; - dst_stride_rgb = -dst_stride_rgb; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; } void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_rgb, int pix); + uint8* dst_argb, int pix); void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_rgb, int pix); - + uint8* dst_argb, int pix); switch (src_fourcc_bayer) { - default: - assert(false); - case FOURCC_RGGB: - BayerRow0 = BayerRowRG; - BayerRow1 = BayerRowGB; - break; case FOURCC_BGGR: BayerRow0 = BayerRowBG; BayerRow1 = BayerRowGR; break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; case FOURCC_GRBG: BayerRow0 = BayerRowGR; BayerRow1 = BayerRowBG; break; - case FOURCC_GBRG: - BayerRow0 = BayerRowGB; - BayerRow1 = BayerRowRG; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; break; + default: + return -1; // Bad FourCC } - for (int y = 0; y < (height - 1); y += 2) { - BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width); + for (int y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, - dst_rgb + dst_stride_rgb, width); + dst_argb + dst_stride_argb, width); src_bayer += src_stride_bayer * 2; - dst_rgb += dst_stride_rgb * 2; + dst_argb += dst_stride_argb * 2; } if (height & 1) { - BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width); + BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width); } return 0; } // Converts any Bayer RGB format to ARGB. -int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, - uint32 src_fourcc_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer) { if (width * 4 > kMaxStride) { - return -1; + return -1; // Size too large for row buffer } // Negative height means invert the image. 
if (height < 0) { @@ -346,60 +368,50 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, dst_stride_v = -dst_stride_v; } void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_rgb, int pix); + uint8* dst_argb, int pix); void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_rgb, int pix); - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + uint8* dst_argb, int pix); + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; SIMD_ALIGNED(uint8 row[kMaxStride * 2]); #if defined(HAS_ARGBTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; - } else -#endif - { - ARGBToYRow = ARGBToYRow_C; } +#endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; - } else -#endif - { - ARGBToUVRow = ARGBToUVRow_C; } +#endif switch (src_fourcc_bayer) { - default: - assert(false); - case FOURCC_RGGB: - BayerRow0 = BayerRowRG; - BayerRow1 = BayerRowGB; - break; case FOURCC_BGGR: BayerRow0 = BayerRowBG; BayerRow1 = BayerRowGR; break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; case FOURCC_GRBG: BayerRow0 = BayerRowGR; BayerRow1 = BayerRowBG; break; - case FOURCC_GBRG: - BayerRow0 = BayerRowGB; - BayerRow1 = BayerRowRG; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; break; + default: + return -1; // Bad FourCC } - for (int y = 0; y < (height - 1); y += 2) { + for (int y = 0; y < height - 1; y += 2) { BayerRow0(src_bayer, src_stride_bayer, row, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, row + kMaxStride, width); @@ -411,7 +423,6 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, dst_u += dst_stride_u; dst_v += dst_stride_v; } - // TODO(fbarchard): Make sure this filters properly if (height & 1) { BayerRow0(src_bayer, src_stride_bayer, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); @@ -420,4 +431,124 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, return 0; } +// Convert I420 to Bayer. +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } +#endif + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } +#endif + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row, width); + ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width); + dst_bayer += dst_stride_bayer; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +#define MAKEBAYERFOURCC(BAYER) \ +LIBYUV_API \ +int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \ + uint8* dst_y, int dst_stride_y, \ + uint8* dst_u, int dst_stride_u, \ + uint8* dst_v, int dst_stride_v, \ + int width, int height) { \ + return BayerToI420(src_bayer, src_stride_bayer, \ + dst_y, dst_stride_y, \ + dst_u, dst_stride_u, \ + dst_v, dst_stride_v, \ + width, height, \ + FOURCC_##BAYER); \ +} \ + \ +LIBYUV_API \ +int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \ + const uint8* src_u, int src_stride_u, \ + const uint8* src_v, int src_stride_v, \ + uint8* dst_bayer, int dst_stride_bayer, \ + int width, int height) { \ + return I420ToBayer(src_y, src_stride_y, \ + src_u, src_stride_u, \ + src_v, src_stride_v, \ + dst_bayer, dst_stride_bayer, \ + width, height, \ + FOURCC_##BAYER); \ +} \ + \ +LIBYUV_API \ +int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \ + uint8* dst_bayer, int dst_stride_bayer, \ + int width, int height) { \ + return ARGBToBayer(src_argb, src_stride_argb, \ + dst_bayer, dst_stride_bayer, \ + width, height, \ + FOURCC_##BAYER); \ +} \ + \ +LIBYUV_API \ +int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \ + uint8* dst_argb, int dst_stride_argb, \ + int width, int height) { \ + return BayerToARGB(src_bayer, src_stride_bayer, \ + dst_argb, dst_stride_argb, \ + width, height, \ + FOURCC_##BAYER); \ +} + +MAKEBAYERFOURCC(BGGR) +MAKEBAYERFOURCC(GBRG) +MAKEBAYERFOURCC(GRBG) +MAKEBAYERFOURCC(RGGB) + +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif diff --git a/files/source/general.cc b/files/source/general.cc deleted file mode 100644 index 9d39f9bf..00000000 --- a/files/source/general.cc +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/general.h" - -#include <string.h> // memcpy(), memset() - -#include "libyuv/planar_functions.h" - -namespace libyuv { - -int -I420Mirror(const uint8* src_yplane, int src_ystride, - const uint8* src_uplane, int src_ustride, - const uint8* src_vplane, int src_vstride, - uint8* dst_yplane, int dst_ystride, - uint8* dst_uplane, int dst_ustride, - uint8* dst_vplane, int dst_vstride, - int width, int height) { - if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL || - dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) { - return -1; - } - - int indO = 0; - int indS = 0; - int wind, hind; - uint8 tmpVal, tmpValU, tmpValV; - // Will swap two values per iteration - const int halfWidth = (width + 1) >> 1; - - // Y - for (wind = 0; wind < halfWidth; wind++) { - for (hind = 0; hind < height; hind++) { - indO = hind * src_ystride + wind; - indS = hind * dst_ystride + (width - wind - 1); - tmpVal = src_yplane[indO]; - dst_yplane[indO] = src_yplane[indS]; - dst_yplane[indS] = tmpVal; - } - } - - const int halfHeight = (height + 1) >> 1; - const int halfSrcuvStride = (height + 1) >> 1; - const int halfuvWidth = (width + 1) >> 2; - - for (wind = 0; wind < halfuvWidth; wind++) { - for (hind = 0; hind < halfHeight; hind++) { - indO = hind * halfSrcuvStride + wind; - indS = hind * halfSrcuvStride + (halfuvWidth - wind - 1); - // U - tmpValU = src_uplane[indO]; - dst_uplane[indO] = src_uplane[indS]; - dst_uplane[indS] = tmpValU; - // V - tmpValV = src_vplane[indO]; - dst_vplane[indO] = src_vplane[indS]; - dst_vplane[indS] = tmpValV; - } - } - return 0; -} - -// Make a center cut -int -I420Crop(uint8* frame, - int src_width, int src_height, - int dst_width, int dst_height) -{ - if (frame == NULL) - return -1; - - if (src_width == dst_width && src_height == dst_height) { - // Nothing to do - return 3 * dst_height * dst_width / 2; - } - if (dst_width > src_width || dst_height > src_height) { - // error - return -1; - } - int i = 0; - int m = 0; - int loop = 0; - int half_dst_width = dst_width / 2; - int halfdst_height = dst_height / 2; - int halfsrc_width = src_width / 2; - int half_dst_height= src_height / 2; - int crop_height = ( src_height - dst_height ) / 2; - int crop_width = ( src_width - dst_width ) / 2; - - for (i = src_width * crop_height + crop_width; loop < dst_height ; - loop++, i += src_width) { - memcpy(&frame[m],&frame[i],dst_width); - m += dst_width; - } - i = src_width * src_height; // ilum - loop = 0; - for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); - loop < halfdst_height; loop++,i += halfsrc_width) { - memcpy(&frame[m],&frame[i],half_dst_width); - m += half_dst_width; - } - loop = 0; - i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr - for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); - loop < halfdst_height; loop++, i += halfsrc_width) { - memcpy(&frame[m],&frame[i],half_dst_width); - m += half_dst_width; - } - return 0; -} - - -int -I420CropPad(const uint8* src_frame, int src_width, - int src_height, uint8* dst_frame, - int dst_width, int dst_height) -{ - if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) { - return -1; - } - if (src_width == dst_width && src_height == dst_height) { - memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1)); - } else { - if (src_height < 
dst_height) { - // pad height - int pad_height = dst_height - src_height; - int i = 0; - int pad_width = 0; - int crop_width = 0; - int width = src_width; - if (src_width < dst_width) { - // pad width - pad_width = dst_width - src_width; - } else { - // cut width - crop_width = src_width - dst_width; - width = dst_width; - } - if (pad_height) { - memset(dst_frame, 0, dst_width * (pad_height >> 1)); - dst_frame += dst_width * (pad_height >> 1); - } - for (i = 0; i < src_height;i++) { - if (pad_width) { - memset(dst_frame, 0, pad_width / 2); - dst_frame += pad_width / 2; - } - src_frame += crop_width >> 1; // in case we have a cut - memcpy(dst_frame,src_frame ,width); - src_frame += crop_width >> 1; - dst_frame += width; - src_frame += width; - if (pad_width) { - memset(dst_frame, 0, pad_width / 2); - dst_frame += pad_width / 2; - } - } - if (pad_height) { - memset(dst_frame, 0, dst_width * (pad_height >> 1)); - dst_frame += dst_width * (pad_height >> 1); - } - if (pad_height) { - memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); - dst_frame += (dst_width >> 2) * (pad_height >> 1); - } - for (i = 0; i < (src_height >> 1); i++) { - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - src_frame += crop_width >> 2; // in case we have a cut - memcpy(dst_frame, src_frame,width >> 1); - src_frame += crop_width >> 2; - dst_frame += width >> 1; - src_frame += width >> 1; - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - } - if (pad_height) { - memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1)); - dst_frame += (dst_width >> 1) * (pad_height >> 1); - } - for (i = 0; i < (src_height >> 1); i++) { - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - src_frame += crop_width >> 2; // in case we have a cut - memcpy(dst_frame, src_frame,width >> 1); - src_frame += crop_width >> 2; - dst_frame += width >> 1; - src_frame += width >> 1; - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - } - if (pad_height) { - memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); - dst_frame += (dst_width >> 2) * (pad_height >> 1); - } - } else { - // cut height - int i = 0; - int pad_width = 0; - int crop_width = 0; - int width = src_width; - - if (src_width < dst_width) { - // pad width - pad_width = dst_width - src_width; - } else { - // cut width - crop_width = src_width - dst_width; - width = dst_width; - } - int diff_height = src_height - dst_height; - src_frame += src_width * (diff_height >> 1); // skip top I - - for (i = 0; i < dst_height; i++) { - if (pad_width) { - memset(dst_frame, 0, pad_width / 2); - dst_frame += pad_width / 2; - } - src_frame += crop_width >> 1; // in case we have a cut - memcpy(dst_frame,src_frame ,width); - src_frame += crop_width >> 1; - dst_frame += width; - src_frame += width; - if (pad_width) { - memset(dst_frame, 0, pad_width / 2); - dst_frame += pad_width / 2; - } - } - src_frame += src_width * (diff_height >> 1); // skip end I - src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr - for (i = 0; i < (dst_height >> 1); i++) { - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - src_frame += crop_width >> 2; // in case we have a cut - memcpy(dst_frame, src_frame,width >> 1); - src_frame += crop_width >> 2; - dst_frame += width >> 1; - src_frame += width >> 1; - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - 
dst_frame += pad_width >> 2; - } - } - src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr - src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb - for (i = 0; i < (dst_height >> 1); i++) { - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - src_frame += crop_width >> 2; // in case we have a cut - memcpy(dst_frame, src_frame, width >> 1); - src_frame += crop_width >> 2; - dst_frame += width >> 1; - src_frame += width >> 1; - if (pad_width) { - memset(dst_frame, 127, pad_width >> 2); - dst_frame += pad_width >> 2; - } - } - } - } - return 0; -} - -} // namespace libyuv diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc new file mode 100644 index 00000000..aa603947 --- /dev/null +++ b/files/source/mjpeg_decoder.cc @@ -0,0 +1,583 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/mjpeg_decoder.h" + +#ifdef HAVE_JPEG +// Must be included before jpeglib +#include <assert.h> +#ifndef __CLR_VER +#include <setjmp.h> +#define HAVE_SETJMP +#endif +#include <stdio.h> +#include <stdlib.h> + +extern "C" { +#include <jpeglib.h> +} + +#include <climits> +#include <cstring> + +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(false), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. + error_mgr_->base.error_exit = &ErrorHandler; +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +// Helper function to validate the jpeg looks ok. +// TODO(fbarchard): Improve performance. Scan backward for EOI? 
+bool ValidateJpeg(const uint8* sample, size_t sample_size) { + if (sample_size < 64) { + // ERROR: Invalid jpeg size: sample_size + return false; + } + if (sample[0] != 0xff || sample[1] != 0xd8) { + // ERROR: Invalid jpeg initial start code + return false; + } + bool soi = true; + int total_eoi = 0; + for (int i = 2; i < static_cast<int>(sample_size) - 1; ++i) { + if (sample[i] == 0xff) { + if (sample[i + 1] == 0xd8) { // Start Of Image + soi = true; + } else if (sample[i + 1] == 0xd9) { // End Of Image + if (soi) { + ++total_eoi; + } + soi = false; + } + } + } + if (!total_eoi) { + // ERROR: Invalid jpeg end code not found. Size sample_size + return false; + } + return true; +} + +bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return false; + } + + buf_.data = src; + buf_.len = static_cast<int>(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return false; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return false; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8* [scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. + int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = true; + } + } + return true; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. 
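Putting the accessors above together, a consumer validates and loads a frame, checks its geometry and color space, then asks for planar output via DecodeToBuffers, defined further below. A minimal sketch, assuming a jpeg-enabled build (HAVE_JPEG), a JPEG frame already in memory, and caller-allocated planes sized per GetComponentWidth/GetComponentHeight; every name outside the MJpegDecoder API is hypothetical:

    #include "libyuv/mjpeg_decoder.h"

    // Decode one in-memory MJPEG frame into caller-provided Y/U/V planes.
    bool DecodeFrame(const uint8* jpg, size_t jpg_size,
                     uint8* y, uint8* u, uint8* v) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(jpg, jpg_size) ||
          decoder.GetColorSpace() != libyuv::MJpegDecoder::kColorSpaceYCbCr) {
        return false;  // invalid data or unexpected color space
      }
      uint8* planes[3] = {y, u, v};  // sized per component width/height
      return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                     decoder.GetHeight());
    }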
+int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +bool MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return false; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return true; +} + +static void CopyRows(uint8* source, int source_stride, + uint8* dest, int pixels, int numrows) { + for (int i = 0; i < numrows; ++i) { + memcpy(dest, source, pixels); + dest += pixels; + source += source_stride; + } +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +bool MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return false; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return false; + } +#endif + if (!StartDecode()) { + return false; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. 
+ if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyRows(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyRows(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyRows(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return false; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return false; + } +#endif + if (!StartDecode()) { + return false; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. + for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. 
+ for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return false; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void MJpegDecoder::init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo, + long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void MJpegDecoder::term_source(j_decompress_ptr cinfo) { + // Nothing to do. +} + +#ifdef HAVE_SETJMP +void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) { + // This is called when a jpeglib command experiences an error. Unfortunately + // jpeglib's error handling model is not very flexible, because it expects the + // error handler to not return--i.e., it wants the program to terminate. To + // recover from errors we use setjmp() as shown in their example. setjmp() is + // C's implementation for the "call with current continuation" functionality + // seen in some functional programming languages. + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + // ERROR: Error in jpeglib: buf + + SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} +#endif + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. + DestroyOutputBuffers(); + + scanlines_ = new uint8** [num_outbufs]; + scanlines_sizes_ = new int[num_outbufs]; + databuf_ = new uint8* [num_outbufs]; + databuf_strides_ = new int[num_outbufs]; + + for (int i = 0; i < num_outbufs; ++i) { + scanlines_[i] = NULL; + scanlines_sizes_[i] = 0; + databuf_[i] = NULL; + databuf_strides_[i] = 0; + } + + num_outbufs_ = num_outbufs; + } +} + +void MJpegDecoder::DestroyOutputBuffers() { + for (int i = 0; i < num_outbufs_; ++i) { + delete [] scanlines_[i]; + delete [] databuf_[i]; + } + delete [] scanlines_; + delete [] databuf_; + delete [] scanlines_sizes_; + delete [] databuf_strides_; + scanlines_ = NULL; + databuf_ = NULL; + scanlines_sizes_ = NULL; + databuf_strides_ = NULL; + num_outbufs_ = 0; +} + +// JDCT_IFAST and do_block_smoothing improve performance substantially. 
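For the streaming path above, DecodeToCallback hands the decoder's own per-component row buffers to the callback one iMCU row at a time rather than copying, so the consumer processes rows as they arrive. A minimal sketch; the callback's parameter list is assumed from the call sites (opaque pointer, per-component data pointers, per-component strides, rows delivered) and every other name is hypothetical:

    // Hypothetical callback: tally how many rows the decoder delivered.
    static void CountRows(void* opaque, const uint8* const* data,
                          const int* strides, int rows) {
      (void)data;
      (void)strides;  // data[0]/strides[0] would hold the Y rows of a YCbCr frame
      *static_cast<int*>(opaque) += rows;
    }

    // After a successful LoadFrame():
    //   int rows_seen = 0;
    //   decoder.DecodeToCallback(&CountRows, &rows_seen,
    //                            decoder.GetWidth(), decoder.GetHeight());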
+bool MJpegDecoder::StartDecode() { + decompress_struct_->raw_data_out = TRUE; + decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default + decompress_struct_->dither_mode = JDITHER_NONE; + decompress_struct_->do_fancy_upsampling = false; // Not applicable to 'raw' + decompress_struct_->enable_2pass_quant = false; // Only for buffered mode + decompress_struct_->do_block_smoothing = false; // blocky but fast + + if (!jpeg_start_decompress(decompress_struct_)) { + // ERROR: Couldn't start JPEG decompressor"; + return false; + } + return true; +} + +bool MJpegDecoder::FinishDecode() { + // jpeglib considers it an error if we finish without decoding the whole + // image, so we call "abort" rather than "finish". + jpeg_abort_decompress(decompress_struct_); + return true; +} + +void MJpegDecoder::SetScanlinePointers(uint8** data) { + for (int i = 0; i < num_outbufs_; ++i) { + uint8* data_i = data[i]; + for (int j = 0; j < scanlines_sizes_[i]; ++j) { + scanlines_[i][j] = data_i; + data_i += GetComponentStride(i); + } + } +} + +inline bool MJpegDecoder::DecodeImcuRow() { + return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) == + jpeg_read_raw_data(decompress_struct_, + scanlines_, + GetImageScanlinesPerImcuRow()); +} + +// The helper function which recognizes the jpeg sub-sampling type. +JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components) { + if (number_of_components == 3) { // Color images. + if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 2 && subsample_y[1] == 2 && + subsample_x[2] == 2 && subsample_y[2] == 2) { + return kJpegYuv420; + } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 2 && subsample_y[1] == 1 && + subsample_x[2] == 2 && subsample_y[2] == 1) { + return kJpegYuv422; + } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 1 && subsample_y[1] == 1 && + subsample_x[2] == 1 && subsample_y[2] == 1) { + return kJpegYuv444; + } + } else if (number_of_components == 1) { // Grey-scale images. + if (subsample_x[0] == 1 && subsample_y[0] == 1) { + return kJpegYuv400; + } + } + return kJpegUnknown; +} + +} // namespace libyuv +#endif // HAVE_JPEG + diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index a7e3e38a..a7f5086a 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -10,321 +10,104 @@ #include "libyuv/planar_functions.h" -#include <string.h> +#include <string.h> // for memset() #include "libyuv/cpu_id.h" -#include "row.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/row.h" +#ifdef __cplusplus namespace libyuv { - -#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) -#define HAS_SPLITUV_NEON -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v -// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. -static void SplitUV_NEON(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix) { - __asm__ volatile - ( - "1:\n" - "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV - "vst1.u8 {q0}, [%1]! \n" // store U - "vst1.u8 {q1}, [%2]! 
\n" // Store V - "subs %3, %3, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(src_uv), - "+r"(dst_u), - "+r"(dst_v), - "+r"(pix) // Output registers - : // Input registers - : "q0", "q1" // Clobber List - ); -} - -#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#if defined(_MSC_VER) -#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -#else -#define TALIGN16(t, var) t var __attribute__((aligned(16))) +extern "C" { #endif -// Shuffle table for converting ABGR to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; - -// Shuffle table for converting BGRA to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; - -#if defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_SPLITUV_SSE2 -__declspec(naked) -static void SplitUV_SSE2(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm7 // even bytes - pand xmm1, xmm7 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqa [edi], xmm2 - lea edi, [edi + 16] - sub ecx, 16 - ja wloop - pop edi - ret +// Copy a plane of data +LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_SPLITUV_SSE2 -static void SplitUV_SSE2(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "psrlw $0x8,%%xmm2\n" - "psrlw $0x8,%%xmm3\n" - "packuswb %%xmm3,%%xmm2\n" - "movdqa %%xmm2,(%2)\n" - "lea 0x10(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory" -); -} #endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } #endif - -static void SplitUV_C(const uint8* src_uv, - uint8* dst_u, uint8* dst_v, int pix) { - // Copy a row of UV. 
- for (int x = 0; x < pix; ++x) { - dst_u[0] = src_uv[0]; - dst_v[0] = src_uv[1]; - src_uv += 2; - dst_u += 1; - dst_v += 1; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; } -} +#endif -static void I420CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { // Copy plane for (int y = 0; y < height; ++y) { - memcpy(dst_y, src_y, width); + CopyRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } } -// Copy I420 with optional flipping -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + uint8*, int, // src_u + uint8*, int, // src_v + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; - int halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } -// SetRows32 writes 'count' bytes using a 32 bit value repeated - -#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) -#define HAS_SETROW_NEON -static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { - __asm__ volatile - ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1:\n" - "vst1.u32 {q0}, [%0]! 
\n" // store - "subs %1, %1, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "q0", "memory" - ); -} - -#elif defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_SETROW_SSE2 -__declspec(naked) -static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { - __asm { - mov eax, [esp + 4] // dst - movd xmm7, [esp + 8] // v32 - mov ecx, [esp + 12] // count - pshufd xmm7, xmm7, 0 - - wloop: - movdqa [eax], xmm7 - lea eax, [eax + 16] - sub ecx, 16 - ja wloop - ret +// Mirror a plane of data +void MirrorPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) - -#define HAS_SETROW_SSE2 -static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { - asm volatile( - "movd %2, %%xmm7\n" - "pshufd $0x0,%%xmm7,%%xmm7\n" -"1:" - "movdqa %%xmm7,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%1\n" - "ja 1b\n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "memory" -); -} #endif - -static void SetRow8_C(uint8* dst, uint32 v8, int count) { - memset(dst, v8, count); -} - -static void I420SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - int value) { - void (*SetRow)(uint8* dst, uint32 value, int pix); -#if defined(HAS_SETROW_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - SetRow = SetRow32_NEON; - } else -#elif defined(HAS_SETROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - SetRow = SetRow32_SSE2; - } else +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) { + MirrorRow = MirrorRow_SSSE3; + } #endif - { - SetRow = SetRow8_C; } +#endif - uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); - // Set plane + // Mirror plane for (int y = 0; y < height; ++y) { - SetRow(dst_y, v32, width); + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; dst_y += dst_stride_y; } } -// Draw a rectangle into I420 -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v) { - if (!dst_y || !dst_u || !dst_v || - width <= 0 || height == 0 || - x < 0 || y < 0 || - value_y < 0 || value_y > 255 || - value_u < 0 || value_u > 255 || - value_v < 0 || value_v > 255) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - int halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - uint8* start_y = dst_y + y * dst_stride_y + x; - uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - - I420SetPlane(start_y, dst_stride_y, width, height, value_y); - I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); - I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); - return 0; -} - -// Helper function to copy yuv data without scaling. Used -// by our jpeg conversion callbacks to incrementally fill a yuv image. -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, @@ -332,1244 +115,1314 @@ int I422ToI420(const uint8* src_y, int src_stride_y, // Negative height means invert the image. if (height < 0) { height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; } - - // Copy Y plane - I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - - // SubSample UV planes. 
- int x, y; - int halfwidth = (width + 1) >> 1; - for (y = 0; y < height; y += 2) { - const uint8* u0 = src_u; - const uint8* u1 = src_u + src_stride_u; - if ((y + 1) >= height) { - u1 = u0; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix); + YUY2ToYRow = YUY2ToYRow_C; + YUY2ToUV422Row = YUY2ToUV422Row_C; +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; } - for (x = 0; x < halfwidth; ++x) { - dst_u[x] = (u0[x] + u1[x] + 1) >> 1; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } } - src_u += src_stride_u * 2; - dst_u += dst_stride_u; } - for (y = 0; y < height; y += 2) { - const uint8* v0 = src_v; - const uint8* v1 = src_v + src_stride_v; - if ((y + 1) >= height) { - v1 = v0; +#elif defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width > 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } } - for (x = 0; x < halfwidth; ++x) { - dst_v[x] = (v0[x] + v1[x] + 1) >> 1; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; } - src_v += src_stride_v * 2; - dst_v += dst_stride_v; - } - return 0; -} - -static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride_frame, - int width, int height) { - // Copy plane - for (int y = 0; y < height; y += 2) { - memcpy(dst, src, width); - src += src_stride_0; - dst += dst_stride_frame; - memcpy(dst, src, width); - src += src_stride_1; - dst += dst_stride_frame; } -} - -// Support converting from FOURCC_M420 -// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for -// easy conversion to I420. -// M420 format description: -// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. -// Chroma is half width / half height. (420) -// src_stride_m420 is row planar. Normally this will be the width in pixels. -// The UV plane is half width, but 2 values, so src_stride_m420 applies to -// this as well as the two Y planes. -static int X420ToI420(const uint8* src_y, - int src_stride_y0, int src_stride_y1, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - int halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - int halfwidth = (width + 1) >> 1; - void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -#if defined(HAS_SPLITUV_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { - SplitUV = SplitUV_NEON; - } else -#elif defined(HAS_SPLITUV_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { - SplitUV = SplitUV_SSE2; - } else #endif - { - SplitUV = SplitUV_C; - } - - I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, - width, height); - int halfheight = (height + 1) >> 1; - for (int y = 0; y < halfheight; ++y) { - // Copy a row of UV. - SplitUV(src_uv, dst_u, dst_v, halfwidth); + for (int y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; dst_u += dst_stride_u; dst_v += dst_stride_v; - src_uv += src_stride_uv; } return 0; } -// Convert M420 to I420. -int M420ToI420(const uint8* src_m420, int src_stride_m420, +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); -} - -// Convert NV12 to I420. -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); -} - -// Convert NV12 to I420. Deprecated. 
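Both the M420 and NV12 entry points above, and the deprecated NV12ToI420 overload that follows, delegate to the shared X420ToI420 helper: the Y rows are copied and the interleaved UV plane is split into separate U and V planes one half-height row at a time with SplitUV. A minimal caller sketch for the public NV12ToI420 path follows; the frame layout, header and strides are illustrative assumptions, not values taken from this change.

#include "libyuv.h"  // assumed umbrella header providing NV12ToI420 and the uint8 typedef

// Hypothetical caller: deinterleave a packed NV12 frame into I420 planes.
// Assumes the UV plane immediately follows the Y plane and that all source
// strides equal the pixel width; error handling is elided.
bool Nv12BufferToI420(const uint8* nv12, int width, int height,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const uint8* src_y = nv12;
  const uint8* src_uv = nv12 + width * height;  // interleaved UV half-plane
  const int half_width = (width + 1) / 2;
  return libyuv::NV12ToI420(src_y, width,
                            src_uv, width,
                            dst_y, width,
                            dst_u, half_width,
                            dst_v, half_width,
                            width, height) == 0;
}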
-int NV12ToI420(const uint8* src_y, - const uint8* src_uv, - int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_frame, src_stride_frame, - src_uv, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); -} - -#if defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_SPLITYUY2_SSE2 -__declspec(naked) -static void SplitYUY2_SSE2(const uint8* src_yuy2, - uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov edx, [esp + 8 + 8] // dst_y - mov esi, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm2, xmm7 // even bytes are Y - pand xmm3, xmm7 - packuswb xmm2, xmm3 - movdqa [edx], xmm2 - lea edx, [edx + 16] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm7 // U - packuswb xmm0, xmm0 - movq qword ptr [esi], xmm0 - lea esi, [esi + 8] - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edi], xmm1 - lea edi, [edi + 8] - sub ecx, 16 - ja wloop - - pop edi - pop esi - ret + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix); + UYVYToYRow = UYVYToYRow_C; + UYVYToUV422Row = UYVYToUV422Row_C; +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + if (width > 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#elif defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width > 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_SPLITYUY2_SSE2 -static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm7,%%xmm2\n" - "pand %%xmm7,%%xmm3\n" - "packuswb %%xmm3,%%xmm2\n" - "movdqa %%xmm2,(%1)\n" - "lea 0x10(%1),%1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%2)\n" - "lea 0x8(%2),%2\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%3)\n" - "lea 
0x8(%3),%3\n" - "sub $0x10,%4\n" - "ja 1b\n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(pix) // %4 - : - : "memory" -); -} #endif -static void SplitYUY2_C(const uint8* src_yuy2, - uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { - // Copy a row of YUY2. - for (int x = 0; x < pix; x += 2) { - dst_y[0] = src_yuy2[0]; - dst_y[1] = src_yuy2[2]; - dst_u[0] = src_yuy2[1]; - dst_v[0] = src_yuy2[3]; - src_yuy2 += 4; - dst_y += 2; - dst_u += 1; - dst_v += 1; + for (int y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; } + return 0; } -// Convert Q420 to I420. -// Format is rows of YY/YUYV -int Q420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_yuy2, int src_stride_yuy2, +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; int halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - void (*SplitYUY2)(const uint8* src_yuy2, - uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); -#if defined(HAS_SPLITYUY2_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 16 == 0) && - IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - SplitYUY2 = SplitYUY2_SSE2; - } else -#endif - { - SplitYUY2 = SplitYUY2_C; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; } - for (int y = 0; y < height; y += 2) { - memcpy(dst_y, src_y, width); - dst_y += dst_stride_y; - src_y += src_stride_y; - // Copy a row of YUY2. 
- SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width); - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_yuy2 += src_stride_yuy2; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } -#if defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_YUY2TOI420ROW_SSE2 -__declspec(naked) -void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm7 // even bytes are Y - pand xmm1, xmm7 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja wloop - ret +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; } -} - -__declspec(naked) -void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm7 // U - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edi], xmm1 - lea edi, [edi + 8] - sub ecx, 16 - ja wloop - - pop edi - pop esi - ret + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } -} -#define HAS_UYVYTOI420ROW_SSE2 -__declspec(naked) -void UYVYToI420RowY_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja wloop - ret + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; } -} +#endif -__declspec(naked) -void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm7 // UYVY -> UVUV - pand xmm1, xmm7 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm7 // U - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edi], xmm1 - lea edi, [edi + 8] - sub ecx, 16 - ja wloop - - pop edi - pop esi - ret + // Mirror plane + for (int y = 0; y < height; ++y) { + ARGBMirrorRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; } + return 0; } -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) - -#define HAS_YUY2TOI420ROW_SSE2 -static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory" -); -} - -static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_y), // %2 - "+r"(pix) // %3 - : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 - : "memory" -); 
-} -#define HAS_UYVYTOI420ROW_SSE2 -static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - asm volatile( -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory" -); -} - -static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_y), // %2 - "+r"(pix) // %3 - : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 - : "memory" -); -} +// Get a blender that optimized for the CPU, alignment and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } #endif - -// Filter 2 rows of YUY2 UV's (422) into U and V (420) -void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - // Output a row of UV values, filtering 2 rows of YUY2 - for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; - dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; +#if defined(HAS_ARGBBLENDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBBlendRow = ARGBBlendRow_SSE2; } +#endif + return ARGBBlendRow; } -void YUY2ToI420RowY_C(const uint8* src_yuy2, - uint8* dst_y, int pix) { - // Copy a row of yuy2 Y values - for (int x = 0; x < pix; ++x) { - dst_y[0] = src_yuy2[0]; - src_yuy2 += 2; - dst_y += 1; +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; } -} - -void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - // Copy a row of uyvy UV values - for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; - dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; } -} + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = GetARGBBlend(); -void UYVYToI420RowY_C(const uint8* src_uyvy, - uint8* dst_y, int pix) { - // Copy a row of uyvy Y values - for (int x = 0; x < pix; ++x) { - dst_y[0] = src_uyvy[1]; - src_uyvy += 2; - dst_y += 1; + for (int y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; } + return 0; } -// Convert YUY2 to I420. -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, int width, int height) { - // Negative height means invert the image. + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } - void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); - void (*YUY2ToI420RowY)(const uint8* src_yuy2, - uint8* dst_y, int pix); -#if defined(HAS_YUY2TOI420ROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 16 == 0) && - IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - YUY2ToI420RowY = YUY2ToI420RowY_SSE2; - YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2; - } else -#endif - { - YUY2ToI420RowY = YUY2ToI420RowY_C; - YUY2ToI420RowUV = YUY2ToI420RowUV_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; } +#endif + for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_stride_yuy2 = 0; - } - YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - YUY2ToI420RowY(src_yuy2, dst_y, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; dst_y += dst_stride_y; - src_yuy2 += src_stride_yuy2; } return 0; } -// Convert UYVY to I420. -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, +// ARGB little endian (bgra in memory) to I422 +// same as I420 except UV plane is full height +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - // Negative height means invert the image. 
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } - void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); - void (*UYVYToI420RowY)(const uint8* src_uyvy, - uint8* dst_y, int pix); -#if defined(HAS_UYVYTOI420ROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 16 == 0) && - IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && - IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && - IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { - UYVYToI420RowY = UYVYToI420RowY_SSE2; - UYVYToI420RowUV = UYVYToI420RowUV_SSE2; - } else -#endif - { - UYVYToI420RowY = UYVYToI420RowY_C; - UYVYToI420RowUV = UYVYToI420RowUV_C; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } - for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - if (y >= (height - 1) ) { // last chroma on odd height clamp height - src_stride_uyvy = 0; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (width > 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } } - UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; } - UYVYToI420RowY(src_uyvy, dst_y, width); + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; dst_y += dst_stride_y; - src_uyvy += src_stride_uyvy; + dst_u += dst_stride_u; + dst_v += dst_stride_v; } return 0; } -// Convert I420 to ARGB. -// TODO(fbarchard): Add SSE2 version and supply C version for fallback. -int I420ToARGB(const uint8* src_y, int src_stride_y, +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. 
if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } + src_u += src_stride_u; + src_v += src_stride_v; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. - EMMS(); return 0; } -// Convert I420 to BGRA. -int I420ToBGRA(const uint8* src_y, int src_stride_y, +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#elif defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } + src_u += src_stride_u; + src_v += src_stride_v; } - EMMS(); return 0; } -// Convert I420 to BGRA. -int I420ToABGR(const uint8* src_y, int src_stride_y, +// Convert I422 to RGBA. 
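The I422 converters in this block (BGRA and ABGR above, RGBA immediately below) all take half-width but full-height chroma, so src_u and src_v advance on every output row rather than every other row as in the I420 paths. A minimal caller sketch, with tightly packed strides assumed for illustration:

// Hypothetical caller: expand an I422 frame to 32-bit ABGR.
int I422FrameToABGR(const uint8* y, const uint8* u, const uint8* v,
                    int width, int height, uint8* dst_abgr) {
  const int half_width = (width + 1) / 2;
  return libyuv::I422ToABGR(y, width,
                            u, half_width,   // chroma planes are full height
                            v, half_width,
                            dst_abgr, width * 4,
                            width, height);
}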
+LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } } +#elif defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } + src_u += src_stride_u; + src_v += src_stride_v; } - EMMS(); return 0; } -// Convert I422 to ARGB. -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, int width, int height) { + if (!src_argb || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) = + ARGBToRGBARow_C; +#if defined(HAS_ARGBTORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + ARGBToRGBARow = ARGBToRGBARow_SSSE3; + } +#endif +#if defined(HAS_ARGBTORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBToRGBARow = ARGBToRGBARow_NEON; } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; + ARGBToRGBARow(src_argb, dst_rgba, width); + src_argb += src_stride_argb; + dst_rgba += dst_stride_rgba; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. - EMMS(); return 0; } -// Convert I444 to ARGB. 
-int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - // Negative height means invert the image. +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + } + if (IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + } + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. - EMMS(); return 0; } -// Convert I400 to ARGB. -int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - // Negative height means invert the image. +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + } + if (IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + } + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + for (int y = 0; y < height; ++y) { - FastConvertYToRGB32Row(src_y, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. 
- EMMS(); return 0; } -// TODO(fbarchard): 64 bit version -#if defined(WIN32) && !defined(COVERAGE_ENABLED) - -#define HAS_I400TOARGBROW_SSE2 -__declspec(naked) -static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - - wloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm7 - por xmm1, xmm7 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - ja wloop - ret +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; } -} - -#define HAS_ABGRTOARGBROW_SSSE3 -__declspec(naked) -static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_abgr - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm7, _kShuffleMaskABGRToARGB - - convertloop : - movdqa xmm0, [eax] - lea eax, [eax + 16] - pshufb xmm0, xmm7 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - ja convertloop - ret + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } -} + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (width * 2 <= kMaxStride) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif -#define HAS_BGRATOARGBROW_SSSE3 -__declspec(naked) -static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_bgra - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm7, _kShuffleMaskBGRAToARGB - - convertloop : - movdqa xmm0, [eax] - lea eax, [eax + 16] - pshufb xmm0, xmm7 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - ja convertloop - ret + for (int y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; } + return 0; } +// Convert ARGB To ARGB1555. 
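ARGBToRGB565 above and the ARGB1555/ARGB4444 conversions that follow differ only in how the four 8-bit channels are truncated and packed into 16 bits. A per-pixel sketch of that packing, assuming the conventional bit layouts and libyuv's integer typedefs; it mirrors what the _C row functions do in spirit rather than copying their code:

// One ARGB pixel (bytes b, g, r, a in memory) packed three ways.
static inline uint16 PackRGB565(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint16>((r >> 3) << 11 | (g >> 2) << 5 | (b >> 3));
}
static inline uint16 PackARGB1555(uint8 b, uint8 g, uint8 r, uint8 a) {
  return static_cast<uint16>((a >> 7) << 15 | (r >> 3) << 10 |
                             (g >> 3) << 5 | (b >> 3));
}
static inline uint16 PackARGB4444(uint8 b, uint8 g, uint8 r, uint8 a) {
  return static_cast<uint16>((a >> 4) << 12 | (r >> 4) << 8 |
                             (g >> 4) << 4 | (b >> 4));
}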
+LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (width * 2 <= kMaxStride) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) - -// TODO(yuche): consider moving ARGB related codes to a separate file. -#define HAS_I400TOARGBROW_SSE2 -static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "pslld $0x18,%%xmm7\n" -"1:" - "movq (%0),%%xmm0\n" - "lea 0x8(%0),%0\n" - "punpcklbw %%xmm0,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "punpcklwd %%xmm0,%%xmm0\n" - "punpckhwd %%xmm1,%%xmm1\n" - "por %%xmm7,%%xmm0\n" - "por %%xmm7,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "movdqa %%xmm1,0x10(%1)\n" - "lea 0x20(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory" -); + for (int y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; } -#define HAS_ABGRTOARGBROW_SSSE3 -static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" - : "+r"(src_abgr), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskABGRToARGB) // %3 - : "memory" -); -} +// Convert ARGB To ARGB4444. 
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (width * 2 <= kMaxStride) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + } + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif -#define HAS_BGRATOARGBROW_SSSE3 -static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, - int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" - : "+r"(src_bgra), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskBGRAToARGB) // %3 - : "memory" -); + for (int y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; } +// Convert NV12 to RGB565. +// TODO(fbarchard): (Re) Optimize for Neon. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } #endif -static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { - // Copy a Y to RGB. - for (int x = 0; x < pix; ++x) { - uint8 y = src_y[0]; - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = 255u; - dst_argb += 4; - ++src_y; + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } +#endif + + for (int y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, row, width); + ARGBToRGB565Row(row, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; } -// Convert I400 to ARGB. -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert NV21 to RGB565. 
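NV12ToRGB565 above and NV21ToRGB565 below use a two-stage row pipeline: each source row is first expanded into a SIMD_ALIGNED ARGB scratch row (bounded by kMaxStride), then packed to RGB565, and the interleaved UV/VU pointer advances only after odd rows to account for 4:2:0 subsampling. A minimal caller sketch, with illustrative strides:

// Hypothetical caller: draw an NV12 frame straight onto a 16-bit surface.
int Nv12ToRgb565Surface(const uint8* y, const uint8* uv,
                        int width, int height,
                        uint8* dst, int dst_stride_bytes) {
  return libyuv::NV12ToRGB565(y, width,
                              uv, width,   // interleaved UV shares the Y stride
                              dst, dst_stride_bytes,
                              width, height);
}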
+LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. if (height < 0) { height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; } - void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); -#if defined(HAS_I400TOARGBROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 8 == 0) && - IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - I400ToARGBRow = I400ToARGBRow_SSE2; - } else #endif - { - I400ToARGBRow = I400ToARGBRow_C; + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } +#endif for (int y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, width); + NV21ToARGBRow(src_y, src_vu, row, width); + ARGBToRGB565Row(row, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; - dst_argb += dst_stride_argb; + if (y & 1) { + src_vu += src_stride_vu; + } } return 0; } -static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - // To support in-place conversion. 
- uint8 r = src_abgr[0]; - uint8 g = src_abgr[1]; - uint8 b = src_abgr[2]; - uint8 a = src_abgr[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_abgr += 4; +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value) { + void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C; +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SetRow = SetRow8_NEON; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + SetRow = SetRow8_X86; + } +#endif + + uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); + // Set plane + for (int y = 0; y < height; ++y) { + SetRow(dst_y, v32, width); + dst_y += dst_stride_y; } } -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height <= 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height, + uint32 value) { + if (!dst_argb || + width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SetRows32_NEON(dst, value, width, dst_stride_argb, height); + return 0; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRows32_X86(dst, value, width, dst_stride_argb, height); + return 0; + } +#endif + SetRows32_C(dst, value, width, dst_stride_argb, height); + return 0; +} + +// Convert unattentuated ARGB to preattenuated ARGB. 
+// An unattenutated ARGB alpha blend uses the formula +// p = a * f + (1 - a) * b +// where +// p is output pixel +// f is foreground pixel +// b is background pixel +// a is alpha value from foreground pixel +// An preattenutated ARGB alpha blend uses the formula +// p = f + (1 - a) * b +// where +// f is foreground pixel premultiplied by alpha + +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); -#if defined(HAS_ABGRTOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 4 == 0) && - IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - ABGRToARGBRow = ABGRToARGBRow_SSSE3; - } else + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBAttenuateRow_C; +#if defined(HAS_ARGBATTENUATE_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSE2; + } #endif - { - ABGRToARGBRow = ABGRToARGBRow_C; +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } +#endif for (int y = 0; y < height; ++y) { - ABGRToARGBRow(src_abgr, dst_argb, width); - src_abgr += src_stride_abgr; + ARGBAttenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } -static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - // To support in-place conversion. - uint8 a = src_bgra[0]; - uint8 r = src_bgra[1]; - uint8 g = src_bgra[2]; - uint8 b = src_bgra[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_bgra += 4; +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; } -} - -// Convert BGRA to ARGB. 
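A quick numeric check of the premultiplication performed by ARGBAttenuate above: each colour channel is scaled by the pixel's own alpha, so a later blend reduces from p = a * f + (1 - a) * b to p = f' + (1 - a) * b. The scalar sketch below is illustrative only; the divide-by-255 rounding is an assumption and the SSE2/SSSE3 rows may differ in the low bit.

// Premultiply one colour channel by alpha (illustrative only).
static inline uint8 AttenuateChannel(uint8 channel, uint8 alpha) {
  return static_cast<uint8>((channel * alpha) / 255);
}
// e.g. a mid-grey foreground at 50% alpha: AttenuateChannel(128, 128) == 64,
// so compositing over black yields about 64 with either blend formula.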
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { if (height < 0) { height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } - void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix); -#if defined(HAS_BGRATOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 4 == 0) && - IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - BGRAToARGBRow = BGRAToARGBRow_SSSE3; - } else -#endif - { - BGRAToARGBRow = BGRAToARGBRow_C; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } + void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBUnattenuateRow_C; +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } +#endif for (int y = 0; y < height; ++y) { - BGRAToARGBRow(src_bgra, dst_argb, width); - src_bgra += src_stride_bgra; + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } -// Convert ARGB to I400. -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); -#if defined(HAS_ARGBTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 4 == 0) && - IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) && - IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } else + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } #endif - { - ARGBToYRow = ARGBToYRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; } + return 0; +} +// Make a rectangle of ARGB Sepia tone. 
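ARGBGrayTo writes to a separate destination, while ARGBGray above and ARGBSepia below work in place on a sub-rectangle, addressing it as dst_argb + dst_y * dst_stride_argb + dst_x * 4. A minimal caller sketch of the in-place form; the coordinates and sizes are illustrative assumptions:

// Hypothetical example: desaturate a 64x64 badge region of a frame in place.
int GrayOutBadge(uint8* frame_argb, int stride_bytes) {
  return libyuv::ARGBGray(frame_argb, stride_bytes,
                          16, 16,     // dst_x, dst_y of the rectangle
                          64, 64);    // width, height of the rectangle
}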
+LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; for (int y = 0; y < height; ++y) { - ARGBToYRow(src_argb, dst_y, width); + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix rotation to each ARGB pixel. +LIBYUV_API +int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb, + int width) = ARGBColorMatrixRow_C; +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + ARGBColorMatrixRow(dst, matrix_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = ARGBColorTableRow_C; +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// ARGBQuantize is used to posterize art. +// e.g. rgb / qvalue * qvalue + qvalue / 2 +// But the low levels implement efficiently with 3 parameters, and could be +// used for other high level operations. +// The divide is replaces with a multiply by reciprocal fixed point multiply. +// Caveat - although SSE2 saturates, the C function does not and should be used +// with care if doing anything but quantization. 
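Concretely, posterizing to steps of 32 levels would pass interval_size = 32, interval_offset = 16 and a scale equal to the fixed-point reciprocal of 32. The sketch below is one reading of the comment above, assuming a 16.16 fixed-point scale; it is not a copy of the SSE2 kernel, and like the C row it does not saturate:

// Scalar sketch of the quantize step: v / interval * interval + offset,
// with the divide folded into a 16.16 fixed-point multiply (assumed encoding).
static inline uint8 QuantizeChannel(uint8 v, int scale,
                                    int interval_size, int interval_offset) {
  return static_cast<uint8>(((v * scale) >> 16) * interval_size +
                            interval_offset);
}
// e.g. scale = 65536 / 32, v = 200: (200 * 2048) >> 16 == 6, and 6 * 32 + 16 == 208.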
+LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; +#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + int32* previous_cumsum = dst_cumsum; + for (int y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; src_argb += src_stride_argb; - dst_y += dst_stride_y; } return 0; } +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C; +#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverage = CumulativeSumToAverage_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, radius); -// Convert RAW to ARGB. -int RAWToARGB(const uint8* src_raw, int src_stride_raw, + src_argb = src_argb + radius * src_stride_argb; + int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + const int32* max_cumsum_bot_row = + &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + const int32* cumsum_top_row = &dst_cumsum[0]; + + for (int y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? 
(y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + int boxwidth = radius * 4; + int x; + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + int n = (width - 1) - radius - x + 1; + CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, - int width, int height) { + int width, int height, uint32 value) { + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } if (height < 0) { height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } else -#endif - { - RAWToARGBRow = RAWToARGBRow_C; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, + int width, uint32 value) = ARGBShadeRow_C; +#if defined(HAS_ARGBSHADE_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif for (int y = 0; y < height; ++y) { - RAWToARGBRow(src_raw, dst_argb, width); - src_raw += src_stride_raw; + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } -// Convert BG24 to ARGB. -int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Interpolate 2 ARGB images by specified amount (0 to 255). 
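// Usage sketch for ARGBBlur above (not part of the patch): the caller owns the
// cumulative-sum working buffer. The allocation helper, names and radius are
// assumptions; any 16-byte-aligned block of width * height * 16 bytes works.
#include <cstdlib>  // std::aligned_alloc / std::free (C++17; any aligned allocator will do)
#include "libyuv/planar_functions.h"  // assumed to declare ARGBBlur

int BlurFrame(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height) {
  // 4 int32 sums per pixel, so one table row is width * 4 entries and the
  // whole table is the width * height * 16 bytes described above.
  int cumsum_stride32 = width * 4;
  int32* cumsum = static_cast<int32*>(
      std::aligned_alloc(16, static_cast<size_t>(width) * height * 16));
  if (!cumsum) {
    return -1;
  }
  int ret = libyuv::ARGBBlur(src_argb, src_stride_argb,
                             dst_argb, dst_stride_argb,
                             cumsum, cumsum_stride32,
                             width, height, 5 /* radius */);
  std::free(cumsum);
  return ret;
}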
+LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. if (height < 0) { height = -height; - src_bg24 = src_bg24 + (height - 1) * src_stride_bg24; - src_stride_bg24 = -src_stride_bg24; - } - void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); -#if defined(HAS_BG24TOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - BG24ToARGBRow = BG24ToARGBRow_SSSE3; - } else -#endif - { - BG24ToARGBRow = BG24ToARGBRow_C; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; } - + void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = ARGBInterpolateRow_C; +#if defined(HAS_ARGBINTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && + IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBInterpolateRow = ARGBInterpolateRow_SSSE3; + } +#endif for (int y = 0; y < height; ++y) { - BG24ToARGBRow(src_bg24, dst_argb, width); - src_bg24 += src_stride_bg24; + ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width, interpolation); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; } return 0; } +#ifdef __cplusplus +} // extern "C" } // namespace libyuv - +#endif diff --git a/files/source/rotate.cc b/files/source/rotate.cc index 12cdd7e1..cac3fa0b 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,49 +8,44 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/planar_functions.h" #include "libyuv/rotate.h" -#include "rotate_priv.h" #include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#if defined(_MSC_VER) -#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" #else -#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" #endif -// Shuffle table for reversing the bytes. 
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) = - { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; -// Shuffle table for reversing the bytes of UV channels. -extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) = - { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; #endif -typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int); -typedef void (*reverse_func)(const uint8*, uint8*, int); -typedef void (*rotate_uv_wx8_func)(const uint8*, int, - uint8*, int, - uint8*, int, int); -typedef void (*rotate_uv_wxh_func)(const uint8*, int, - uint8*, int, - uint8*, int, int, int); -typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int); -typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int); - -#if 0 // Need to add rotate_neon.s to the build to enable this -#ifdef __ARM_NEON__ -extern "C" { -void RestoreRegisters_NEON(unsigned long long *restore); -void SaveRegisters_NEON(unsigned long long *store); -#define HAS_REVERSE_LINE_NEON -void ReverseLine_NEON(const uint8* src, uint8* dst, int width); -#define HAS_REVERSE_LINE_UV_NEON -void ReverseLineUV_NEON(const uint8* src, +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) +#define HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +#define HAS_MIRRORROW_UV_NEON +void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); #define HAS_TRANSPOSE_WX8_NEON @@ -61,16 +56,14 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -} // extern "C" -#endif -#endif +#endif // defined(__ARM_NEON__) -#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_TRANSPOSE_WX8_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { -__asm { + __asm { push edi push esi push ebp @@ -79,9 +72,11 @@ __asm { mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width - convertloop : + // Read in the data from the source pointer. // First round of bit swap. + align 16 + convertloop: movq xmm0, qword ptr [eax] lea ebp, [eax + 8] movq xmm1, qword ptr [eax + edi] @@ -144,10 +139,10 @@ __asm { movq qword ptr [edx], xmm3 movdqa xmm7, xmm3 palignr xmm7, xmm7, 8 + sub ecx, 8 movq qword ptr [edx + esi], xmm7 lea edx, [edx + 2 * esi] - sub ecx, 8 - ja convertloop + jg convertloop pop ebp pop esi @@ -157,12 +152,12 @@ __asm { } #define HAS_TRANSPOSE_UVWX8_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int w) { -__asm { + __asm { push ebx push esi push edi @@ -178,7 +173,9 @@ __asm { and esp, ~15 mov [esp + 16], ecx mov ecx, [ecx + 16 + 28] // w - convertloop : + + align 16 + convertloop: // Read in the data from the source pointer. // First round of bit swap. 
movdqa xmm0, [eax] @@ -268,12 +265,12 @@ __asm { movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 punpckhdq xmm0, xmm7 + sub ecx, 8 movlpd qword ptr [edx + esi], xmm0 lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - sub ecx, 8 - ja convertloop + jg convertloop mov esp, [esp + 16] pop ebp @@ -283,356 +280,355 @@ __asm { ret } } -#elif (defined(__i386__) || defined(__x86_64__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - asm volatile( -"1:" - // Read in the data from the source pointer. - // First round of bit swap. - "movq (%0),%%xmm0\n" - "movq (%0,%3),%%xmm1\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm1,%%xmm0\n" - "movq (%0),%%xmm2\n" - "movdqa %%xmm0,%%xmm1\n" - "palignr $0x8,%%xmm1,%%xmm1\n" - "movq (%0,%3),%%xmm3\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm3,%%xmm2\n" - "movdqa %%xmm2,%%xmm3\n" - "movq (%0),%%xmm4\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "movq (%0,%3),%%xmm5\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm5,%%xmm4\n" - "movdqa %%xmm4,%%xmm5\n" - "movq (%0),%%xmm6\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq (%0,%3),%%xmm7\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm7,%%xmm6\n" - "neg %3\n" - "movdqa %%xmm6,%%xmm7\n" - "lea 0x8(%0,%3,8),%0\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "neg %3\n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "palignr $0x8,%%xmm2,%%xmm2\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm4,%%xmm6\n" - "movdqa %%xmm5,%%xmm7\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqa %%xmm0,%%xmm4\n" - "palignr $0x8,%%xmm4,%%xmm4\n" - "movq %%xmm4,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm6,%%xmm2\n" - "movdqa %%xmm2,%%xmm6\n" - "movq %%xmm2,(%1)\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "punpckldq %%xmm5,%%xmm1\n" - "movq %%xmm6,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm1,%%xmm5\n" - "movq %%xmm1,(%1)\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq %%xmm5,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm7,%%xmm3\n" - "movq %%xmm3,(%1)\n" - "movdqa %%xmm3,%%xmm7\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "movq %%xmm7,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "r"(static_cast<intptr_t>(dst_stride)) // %4 - : "memory" -); + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. 
+ ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(static_cast<intptr_t>(dst_stride)) // %4 + : "memory", "cc" + #if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + #endif + ); } -#if defined (__i386__) +#if !defined(YUV_DISABLE_ASM) && defined (__i386__) #define HAS_TRANSPOSE_UVWX8_SSE2 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int w); - asm( - ".text\n" -#if defined(OSX) - ".globl _TransposeUVWx8_SSE2\n" -"_TransposeUVWx8_SSE2:\n" -#else - ".global TransposeUVWx8_SSE2\n" -"TransposeUVWx8_SSE2:\n" -#endif - "push %ebx\n" - "push %esi\n" - "push %edi\n" - "push %ebp\n" - "mov 0x14(%esp),%eax\n" - "mov 0x18(%esp),%edi\n" - "mov 0x1c(%esp),%edx\n" - "mov 0x20(%esp),%esi\n" - "mov 0x24(%esp),%ebx\n" - "mov 0x28(%esp),%ebp\n" - "mov %esp,%ecx\n" - "sub $0x14,%esp\n" - "and $0xfffffff0,%esp\n" - "mov %ecx,0x10(%esp)\n" - "mov 0x2c(%ecx),%ecx\n" + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" -"1:" - "movdqa (%eax),%xmm0\n" - "movdqa (%eax,%edi,1),%xmm1\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm0,%xmm7\n" - "punpcklbw %xmm1,%xmm0\n" - "punpckhbw 
%xmm1,%xmm7\n" - "movdqa %xmm7,%xmm1\n" - "movdqa (%eax),%xmm2\n" - "movdqa (%eax,%edi,1),%xmm3\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm2,%xmm7\n" - "punpcklbw %xmm3,%xmm2\n" - "punpckhbw %xmm3,%xmm7\n" - "movdqa %xmm7,%xmm3\n" - "movdqa (%eax),%xmm4\n" - "movdqa (%eax,%edi,1),%xmm5\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm4,%xmm7\n" - "punpcklbw %xmm5,%xmm4\n" - "punpckhbw %xmm5,%xmm7\n" - "movdqa %xmm7,%xmm5\n" - "movdqa (%eax),%xmm6\n" - "movdqa (%eax,%edi,1),%xmm7\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm5,(%esp)\n" - "neg %edi\n" - "movdqa %xmm6,%xmm5\n" - "punpcklbw %xmm7,%xmm6\n" - "punpckhbw %xmm7,%xmm5\n" - "movdqa %xmm5,%xmm7\n" - "lea 0x10(%eax,%edi,8),%eax\n" - "neg %edi\n" - "movdqa %xmm0,%xmm5\n" - "punpcklwd %xmm2,%xmm0\n" - "punpckhwd %xmm2,%xmm5\n" - "movdqa %xmm5,%xmm2\n" - "movdqa %xmm1,%xmm5\n" - "punpcklwd %xmm3,%xmm1\n" - "punpckhwd %xmm3,%xmm5\n" - "movdqa %xmm5,%xmm3\n" - "movdqa %xmm4,%xmm5\n" - "punpcklwd %xmm6,%xmm4\n" - "punpckhwd %xmm6,%xmm5\n" - "movdqa %xmm5,%xmm6\n" - "movdqa (%esp),%xmm5\n" - "movdqa %xmm6,(%esp)\n" - "movdqa %xmm5,%xmm6\n" - "punpcklwd %xmm7,%xmm5\n" - "punpckhwd %xmm7,%xmm6\n" - "movdqa %xmm6,%xmm7\n" - "movdqa %xmm0,%xmm6\n" - "punpckldq %xmm4,%xmm0\n" - "punpckhdq %xmm4,%xmm6\n" - "movdqa %xmm6,%xmm4\n" - "movdqa (%esp),%xmm6\n" - "movlpd %xmm0,(%edx)\n" - "movhpd %xmm0,(%ebx)\n" - "movlpd %xmm4,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm4,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm2,%xmm0\n" - "punpckldq %xmm6,%xmm2\n" - "movlpd %xmm2,(%edx)\n" - "movhpd %xmm2,(%ebx)\n" - "punpckhdq %xmm6,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm1,%xmm0\n" - "punpckldq %xmm5,%xmm1\n" - "movlpd %xmm1,(%edx)\n" - "movhpd %xmm1,(%ebx)\n" - "punpckhdq %xmm5,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm3,%xmm0\n" - "punpckldq %xmm7,%xmm3\n" - "movlpd %xmm3,(%edx)\n" - "movhpd %xmm3,(%ebx)\n" - "punpckhdq %xmm7,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "sub $0x8,%ecx\n" - "ja 1b\n" - "mov 0x10(%esp),%esp\n" - "pop %ebp\n" - "pop %edi\n" - "pop %esi\n" - "pop %ebx\n" - "ret\n" +"1: \n" + "movdqa (%eax),%xmm0 \n" + "movdqa (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqa (%eax),%xmm2 \n" + "movdqa (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqa (%eax),%xmm4 \n" + "movdqa (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqa (%eax),%xmm6 \n" + "movdqa (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + 
"punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqa (%esp),%xmm5 \n" + "movdqa %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqa (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" + "ret \n" ); -#elif defined (__x86_64__) +#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. #define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - asm volatile( -"1:" + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. 
- "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3),%%xmm1\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm0,%%xmm8\n" - "punpcklbw %%xmm1,%%xmm0\n" - "punpckhbw %%xmm1,%%xmm8\n" - "movdqa (%0),%%xmm2\n" - "movdqa %%xmm0,%%xmm1\n" - "movdqa %%xmm8,%%xmm9\n" - "palignr $0x8,%%xmm1,%%xmm1\n" - "palignr $0x8,%%xmm9,%%xmm9\n" - "movdqa (%0,%3),%%xmm3\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm2,%%xmm10\n" - "punpcklbw %%xmm3,%%xmm2\n" - "punpckhbw %%xmm3,%%xmm10\n" - "movdqa %%xmm2,%%xmm3\n" - "movdqa %%xmm10,%%xmm11\n" - "movdqa (%0),%%xmm4\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "palignr $0x8,%%xmm11,%%xmm11\n" - "movdqa (%0,%3),%%xmm5\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm4,%%xmm12\n" - "punpcklbw %%xmm5,%%xmm4\n" - "punpckhbw %%xmm5,%%xmm12\n" - "movdqa %%xmm4,%%xmm5\n" - "movdqa %%xmm12,%%xmm13\n" - "movdqa (%0),%%xmm6\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "palignr $0x8,%%xmm13,%%xmm13\n" - "movdqa (%0,%3),%%xmm7\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm6,%%xmm14\n" - "punpcklbw %%xmm7,%%xmm6\n" - "punpckhbw %%xmm7,%%xmm14\n" - "neg %3\n" - "movdqa %%xmm6,%%xmm7\n" - "movdqa %%xmm14,%%xmm15\n" - "lea 0x10(%0,%3,8),%0\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "palignr $0x8,%%xmm15,%%xmm15\n" - "neg %3\n" + ".p2align 4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqa (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqa (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqa (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqa (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqa (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "palignr $0x8,%%xmm2,%%xmm2\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm4,%%xmm6\n" - "movdqa %%xmm5,%%xmm7\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "punpcklwd %%xmm10,%%xmm8\n" - "punpcklwd %%xmm11,%%xmm9\n" - "movdqa %%xmm8,%%xmm10\n" - "movdqa %%xmm9,%%xmm11\n" - "palignr $0x8,%%xmm10,%%xmm10\n" - "palignr $0x8,%%xmm11,%%xmm11\n" - "punpcklwd %%xmm14,%%xmm12\n" - "punpcklwd %%xmm15,%%xmm13\n" - "movdqa %%xmm12,%%xmm14\n" - "movdqa %%xmm13,%%xmm15\n" - "palignr $0x8,%%xmm14,%%xmm14\n" - "palignr $0x8,%%xmm15,%%xmm15\n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqa %%xmm0,%%xmm4\n" - "palignr $0x8,%%xmm4,%%xmm4\n" - "movq %%xmm4,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm6,%%xmm2\n" - "movdqa %%xmm2,%%xmm6\n" - "movq %%xmm2,(%1)\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "punpckldq %%xmm5,%%xmm1\n" - "movq %%xmm6,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm1,%%xmm5\n" - "movq %%xmm1,(%1)\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq %%xmm5,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm7,%%xmm3\n" - "movq %%xmm3,(%1)\n" - "movdqa %%xmm3,%%xmm7\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "movq %%xmm7,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm12,%%xmm8\n" - "movq %%xmm8,(%1)\n" - "movdqa %%xmm8,%%xmm12\n" - "palignr $0x8,%%xmm12,%%xmm12\n" - "movq %%xmm12,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm14,%%xmm10\n" - "movdqa %%xmm10,%%xmm14\n" - "movq %%xmm10,(%1)\n" - "palignr $0x8,%%xmm14,%%xmm14\n" - "punpckldq %%xmm13,%%xmm9\n" - "movq %%xmm14,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm9,%%xmm13\n" - "movq %%xmm9,(%1)\n" - "palignr $0x8,%%xmm13,%%xmm13\n" - "movq %%xmm13,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm15,%%xmm11\n" - "movq %%xmm11,(%1)\n" - "movdqa %%xmm11,%%xmm15\n" - "palignr $0x8,%%xmm15,%%xmm15\n" - "movq %%xmm15,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq 
%%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "r"(static_cast<intptr_t>(src_stride)), // %3 "r"(static_cast<intptr_t>(dst_stride)) // %4 - : "memory" + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" ); } @@ -641,98 +637,99 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int w) { - asm volatile( -"1:" + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%4),%%xmm1\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm0,%%xmm8\n" - "punpcklbw %%xmm1,%%xmm0\n" - "punpckhbw %%xmm1,%%xmm8\n" - "movdqa %%xmm8,%%xmm1\n" - "movdqa (%0),%%xmm2\n" - "movdqa (%0,%4),%%xmm3\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm2,%%xmm8\n" - "punpcklbw %%xmm3,%%xmm2\n" - "punpckhbw %%xmm3,%%xmm8\n" - "movdqa %%xmm8,%%xmm3\n" - "movdqa (%0),%%xmm4\n" - "movdqa (%0,%4),%%xmm5\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm4,%%xmm8\n" - "punpcklbw %%xmm5,%%xmm4\n" - "punpckhbw %%xmm5,%%xmm8\n" - "movdqa %%xmm8,%%xmm5\n" - "movdqa (%0),%%xmm6\n" - "movdqa (%0,%4),%%xmm7\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm6,%%xmm8\n" - "punpcklbw %%xmm7,%%xmm6\n" - "neg %4\n" - "lea 0x10(%0,%4,8),%0\n" - "punpckhbw %%xmm7,%%xmm8\n" - "movdqa %%xmm8,%%xmm7\n" - "neg %4\n" + ".p2align 4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqa (%0),%%xmm4 \n" + "movdqa (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" // Second round of bit swap. 
- "movdqa %%xmm0,%%xmm8\n" - "movdqa %%xmm1,%%xmm9\n" - "punpckhwd %%xmm2,%%xmm8\n" - "punpckhwd %%xmm3,%%xmm9\n" - "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm8,%%xmm2\n" - "movdqa %%xmm9,%%xmm3\n" - "movdqa %%xmm4,%%xmm8\n" - "movdqa %%xmm5,%%xmm9\n" - "punpckhwd %%xmm6,%%xmm8\n" - "punpckhwd %%xmm7,%%xmm9\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm8,%%xmm6\n" - "movdqa %%xmm9,%%xmm7\n" + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8\n" - "punpckldq %%xmm4,%%xmm0\n" - "movlpd %%xmm0,(%1)\n" // Write back U channel - "movhpd %%xmm0,(%2)\n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm2,%%xmm8\n" - "punpckldq %%xmm6,%%xmm2\n" - "movlpd %%xmm2,(%1)\n" - "movhpd %%xmm2,(%2)\n" - "punpckhdq %%xmm6,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm1,%%xmm8\n" - "punpckldq %%xmm5,%%xmm1\n" - "movlpd %%xmm1,(%1)\n" - "movhpd %%xmm1,(%2)\n" - "punpckhdq %%xmm5,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm3,%%xmm8\n" - "punpckldq %%xmm7,%%xmm3\n" - "movlpd %%xmm3,(%1)\n" - "movhpd %%xmm3,(%2)\n" - "punpckhdq %%xmm7,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "sub $0x8,%3\n" - "ja 1b\n" + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 @@ -740,7 +737,9 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, : "r"(static_cast<intptr_t>(src_stride)), // %4 "r"(static_cast<intptr_t>(dst_stride_a)), // %5 "r"(static_cast<intptr_t>(dst_stride_b)) // %6 - : "memory" + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" ); } #endif @@ -748,9 +747,8 @@ static void 
TransposeUVWx8_SSE2(const uint8* src, int src_stride, static void TransposeWx8_C(const uint8* src, int src_stride, uint8* dst, int dst_stride, - int w) { - int i; - for (i = 0; i < w; ++i) { + int width) { + for (int i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; dst[1] = src[1 * src_stride]; dst[2] = src[2 * src_stride]; @@ -767,184 +765,143 @@ static void TransposeWx8_C(const uint8* src, int src_stride, static void TransposeWxH_C(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { - int i, j; - for (i = 0; i < width; ++i) - for (j = 0; j < height; ++j) + for (int i = 0; i < width; ++i) { + for (int j = 0; j < height; ++j) { dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } } +LIBYUV_API void TransposePlane(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { - int i = height; - rotate_wx8_func TransposeWx8; - rotate_wxh_func TransposeWxH; - + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) = TransposeWx8_C; #if defined(HAS_TRANSPOSE_WX8_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 8 == 0) && - IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; - TransposeWxH = TransposeWxH_C; - } else -#endif -#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { - TransposeWx8 = TransposeWx8_FAST_SSSE3; - TransposeWxH = TransposeWxH_C; - } else + } #endif #if defined(HAS_TRANSPOSE_WX8_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 8 == 0) && - IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { TransposeWx8 = TransposeWx8_SSSE3; - TransposeWxH = TransposeWxH_C; - } else + } #endif - { - TransposeWx8 = TransposeWx8_C; - TransposeWxH = TransposeWxH_C; +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; } +#endif - // work across the source in 8x8 tiles + // Work across the source in 8x8 tiles + int i = height; while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); - - src += 8 * src_stride; // go down 8 rows - dst += 8; // move over 8 columns - i -= 8; + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; } - TransposeWxH(src, src_stride, dst, dst_stride, width, i); + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); } +LIBYUV_API void RotatePlane90(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end + // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src += src_stride * (height - 1); src_stride = -src_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); } +LIBYUV_API void RotatePlane270(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { // Rotate by 270 is a transpose with the destination written - // from bottom to top. 
So set the destination pointer to the end + // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst += dst_stride * (width - 1); dst_stride = -dst_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); } -static void ReverseLine_C(const uint8* src, uint8* dst, int width) { - int i; - src += width - 1; - for (i = 0; i < width; ++i) { - dst[i] = src[0]; - --src; - } -} - -#if defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_REVERSE_LINE_SSSE3 -__declspec(naked) -static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) { -__asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm7, _kShuffleReverse - lea eax, [eax + ecx - 16] - convertloop : - movdqa xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm7 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja convertloop - ret - } -} - -#elif (defined(__i386__) || defined(__x86_64__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_REVERSE_LINE_SSSE3 -static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) { - intptr_t temp_width = static_cast<intptr_t>(width); - asm volatile( - "movdqa (%3),%%xmm7\n" - "lea -0x10(%0,%2,1),%0\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea -0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "r"(kShuffleReverse) // %3 - : "memory" -); -} -#endif - +LIBYUV_API void RotatePlane180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { - int i; - reverse_func ReverseLine; - -#if defined(HAS_REVERSE_LINE_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { - ReverseLine = ReverseLine_NEON; - } else + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_NEON; + } #endif -#if defined(HAS_REVERSE_LINE_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { - ReverseLine = ReverseLine_SSSE3; - } else +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSE2; + } #endif - { - ReverseLine = ReverseLine_C; +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSSE3; } - // Rotate by 180 is a mirror and vertical flip - src += src_stride * (height - 1); - - for (i = 0; i < height; ++i) { - ReverseLine(src, dst, width); - src -= src_stride; +#endif + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && 
IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + if (width > kMaxStride) { + return; + } + // Swap first and last row and mirror the content. Uses a temporary row. + SIMD_ALIGNED(uint8 row[kMaxStride]); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + // Odd height will harmlessly mirror the middle row twice. + for (int y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; } } static void TransposeUVWx8_C(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, - int w) { - int i; - for (i = 0; i < w; ++i) { + int width) { + for (int i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; dst_b[0] = src[0 * src_stride + 1]; dst_a[1] = src[1 * src_stride + 0]; @@ -970,71 +927,55 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWxH_C(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, - int w, int h) { - int i, j; - for (i = 0; i < w * 2; i += 2) - for (j = 0; j < h; ++j) { + int width, int height) { + for (int i = 0; i < width * 2; i += 2) + for (int j = 0; j < height; ++j) { dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; } } +LIBYUV_API void TransposeUV(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width, int height) { - int i = height; - rotate_uv_wx8_func TransposeWx8; - rotate_uv_wxh_func TransposeWxH; - + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; #if defined(HAS_TRANSPOSE_UVWX8_NEON) - unsigned long long store_reg[8]; - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { - SaveRegisters_NEON(store_reg); - TransposeWx8 = TransposeUVWx8_NEON; - TransposeWxH = TransposeUVWxH_C; - } else -#endif -#if defined(HAS_TRANSPOSE_UVWX8_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (width % 8 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && - IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) { - TransposeWx8 = TransposeUVWx8_SSE2; - TransposeWxH = TransposeUVWxH_C; - } else -#endif - { - TransposeWx8 = TransposeUVWx8_C; - TransposeWxH = TransposeUVWxH_C; + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; } +#endif - // work through the source in 8x8 tiles + // Work through the source in 8x8 tiles. 
+ int i = height; while (i >= 8) { - TransposeWx8(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width); - - src += 8 * src_stride; // go down 8 rows - dst_a += 8; // move over 8 columns - dst_b += 8; // move over 8 columns - i -= 8; + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. + i -= 8; } - TransposeWxH(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, i); - -#if defined(HAS_TRANSPOSE_UVWX8_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { - RestoreRegisters_NEON(store_reg); - } -#endif + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); } +LIBYUV_API void RotateUV90(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, @@ -1048,6 +989,7 @@ void RotateUV90(const uint8* src, int src_stride, width, height); } +LIBYUV_API void RotateUV270(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, @@ -1063,119 +1005,38 @@ void RotateUV270(const uint8* src, int src_stride, width, height); } -#if defined(WIN32) && !defined(COVERAGE_ENABLED) -#define HAS_REVERSE_LINE_UV_SSSE3 -__declspec(naked) -void ReverseLineUV_SSSE3(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { -__asm { - push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_a - mov edi, [esp + 4 + 12] // dst_b - mov ecx, [esp + 4 + 16] // width - movdqa xmm7, _kShuffleReverseUV - lea eax, [eax + ecx * 2 - 16] - - convertloop : - movdqa xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm7 - movlpd qword ptr [edx], xmm0 - lea edx, [edx + 8] - movhpd qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 8 - ja convertloop - pop edi - ret - } -} - -#elif (defined(__i386__) || defined(__x86_64__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_REVERSE_LINE_UV_SSSE3 -void ReverseLineUV_SSSE3(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { - intptr_t temp_width = static_cast<intptr_t>(width); - asm volatile( - "movdqa (%4),%%xmm7\n" - "lea -0x10(%0,%3,2),%0\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea -0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movlpd %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "movhpd %%xmm0,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x8,%3\n" - "ja 1b\n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(temp_width) // %3 - : "r"(kShuffleReverseUV) // %4 - : "memory" -); -} -#endif - -static void ReverseLineUV_C(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { - int i; - src += width << 1; - for (i = 0; i < width; ++i) { - src -= 2; - dst_a[i] = src[0]; - dst_b[i] = src[1]; - } -} - +// Rotate 180 is a horizontal and vertical flip. 
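// Note (not part of the patch): the transpose-based rotations above reduce to
// one index identity. TransposeWxH_C writes
//   dst[i * dst_stride + j] = src[j * src_stride + i]
// and RotatePlane90 hands it the source with the last row first (negated
// stride), so row j of the flipped source is original row (height - 1 - j):
//   dst[i * dst_stride + j] = src[(height - 1 - j) * src_stride + i]
// which for the usual top-down row order is a 90 degree clockwise rotation.
// RotatePlane270 applies the same flip to the destination instead, and the UV
// variants combine the trick with the de-interleaving done by TransposeUV.
// Equivalent scalar form, for illustration only:
//   for (int i = 0; i < width; ++i)
//     for (int j = 0; j < height; ++j)
//       dst[i * dst_stride + j] = src[(height - 1 - j) * src_stride + i];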
+LIBYUV_API void RotateUV180(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width, int height) { - int i; - reverse_uv_func ReverseLine; - -#if defined(HAS_REVERSE_LINE_UV_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && - IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { - ReverseLine = ReverseLineUV_NEON; - } else -#endif -#if defined(HAS_REVERSE_LINE_UV_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && - IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { - ReverseLine = ReverseLineUV_SSSE3; - } else -#endif - { - ReverseLine = ReverseLineUV_C; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorRowUV_C; +#if defined(HAS_MIRRORROW_UV_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRowUV = MirrorRowUV_NEON; } +#elif defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + MirrorRowUV = MirrorRowUV_SSSE3; + } +#endif dst_a += dst_stride_a * (height - 1); dst_b += dst_stride_b * (height - 1); - for (i = 0; i < height; ++i) { - ReverseLine(src, dst_a, dst_b, width); - - src += src_stride; // down one line at a time - dst_a -= dst_stride_a; // nominally up one line at a time - dst_b -= dst_stride_b; // nominally up one line at a time + for (int i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; } } +LIBYUV_API int I420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -1184,6 +1045,10 @@ int I420Rotate(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height, RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; @@ -1248,6 +1113,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y, return -1; } +LIBYUV_API int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, uint8* dst_y, int dst_stride_y, @@ -1255,6 +1121,10 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height, RotationMode mode) { + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; @@ -1271,7 +1141,8 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return NV12ToI420(src_y, src_uv, src_stride_y, + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, @@ -1309,4 +1180,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, return -1; } +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc new file mode 100644 index 00000000..9c994467 --- /dev/null +++ b/files/source/rotate_argb.cc @@ -0,0 +1,175 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGBScale has a function to copy pixels to a row, striding each source +// pixel by a constant. +#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \ + defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width); +#endif +void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, + int src_stepx, + uint8* dst_ptr, int dst_width); + +static void ARGBTranspose(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, + int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(height, 4) && // width of dest. + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } +#endif + + int src_pixel_step = src_stride / 4; + for (int i = 0; i < width; ++i) { // column of source to row of dest. + ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height); + dst += dst_stride; + src += 4; + } +} + +void ARGBRotate90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a ARGBTranspose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + ARGBTranspose(src, src_stride, dst, dst_stride, width, height); +} + +void ARGBRotate270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a ARGBTranspose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. 
+ dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + ARGBTranspose(src, src_stride, dst, dst_stride, width, height); +} + +void ARGBRotate180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + if (width * 4 > kMaxStride) { + return; + } + // Swap first and last row and mirror the content. Uses a temporary row. + SIMD_ALIGNED(uint8 row[kMaxStride]); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + // Odd height will harmlessly mirror the middle row twice. + for (int y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc new file mode 100644 index 00000000..49b30032 --- /dev/null +++ b/files/source/rotate_neon.cc @@ -0,0 +1,406 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +static const uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %4, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" + "1: \n" + "mov r9, %0 \n" + + "vld1.8 {d0}, [r9], %1 \n" + "vld1.8 {d1}, [r9], %1 \n" + "vld1.8 {d2}, [r9], %1 \n" + "vld1.8 {d3}, [r9], %1 \n" + "vld1.8 {d4}, [r9], %1 \n" + "vld1.8 {d5}, [r9], %1 \n" + "vld1.8 {d6}, [r9], %1 \n" + "vld1.8 {d7}, [r9] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov r9, %2 \n" + + "vst1.8 {d1}, [r9], %3 \n" + "vst1.8 {d0}, [r9], %3 \n" + "vst1.8 {d3}, [r9], %3 \n" + "vst1.8 {d2}, [r9], %3 \n" + "vst1.8 {d5}, [r9], %3 \n" + "vst1.8 {d4}, [r9], %3 \n" + "vst1.8 {d7}, [r9], %3 \n" + "vst1.8 {d6}, [r9] \n" + + "add %0, #8 \n" // src += 8 + "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride + "subs %4, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %4, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2 \n" + "blt 3f \n" + + "cmp %4, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov r9, %0 \n" + "vld1.32 {d0[0]}, [r9], %1 \n" + "vld1.32 {d0[1]}, [r9], %1 \n" + "vld1.32 {d1[0]}, [r9], %1 \n" + "vld1.32 {d1[1]}, [r9], %1 \n" + "vld1.32 {d2[0]}, [r9], %1 \n" + "vld1.32 {d2[1]}, [r9], %1 \n" + "vld1.32 {d3[0]}, [r9], %1 \n" + "vld1.32 {d3[1]}, [r9] \n" + + "mov r9, %2 \n" + + "vld1.8 {q3}, [%5] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO: rework shuffle above to write + // out with 4 instead of 8 writes + "vst1.32 {d4[0]}, [r9], %3 \n" + "vst1.32 {d4[1]}, [r9], %3 \n" + "vst1.32 {d5[0]}, [r9], %3 \n" + "vst1.32 {d5[1]}, [r9] \n" + + "add r9, %2, #4 \n" + "vst1.32 {d0[0]}, [r9], %3 \n" + "vst1.32 {d0[1]}, [r9], %3 \n" + "vst1.32 {d1[0]}, [r9], %3 \n" + "vst1.32 {d1[1]}, [r9] \n" + + "add %0, #4 \n" // src += 4 + "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride + "subs %4, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov r9, %0 \n" + "vld1.16 {d0[0]}, [r9], %1 \n" + "vld1.16 {d1[0]}, [r9], %1 \n" + "vld1.16 {d0[1]}, [r9], %1 \n" + "vld1.16 {d1[1]}, [r9], %1 \n" + "vld1.16 {d0[2]}, [r9], %1 \n" + "vld1.16 {d1[2]}, [r9], %1 \n" + "vld1.16 {d0[3]}, [r9], %1 \n" + "vld1.16 {d1[3]}, [r9] \n" + + "vtrn.8 d0, d1 \n" + + "mov r9, %2 \n" + + "vst1.64 {d0}, [r9], %3 \n" + "vst1.64 {d1}, [r9] \n" + + "add %0, #2 \n" // src += 2 + "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride + "subs %4, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%0], %1 \n" + 
"vld1.8 {d0[1]}, [%0], %1 \n" + "vld1.8 {d0[2]}, [%0], %1 \n" + "vld1.8 {d0[3]}, [%0], %1 \n" + "vld1.8 {d0[4]}, [%0], %1 \n" + "vld1.8 {d0[5]}, [%0], %1 \n" + "vld1.8 {d0[6]}, [%0], %1 \n" + "vld1.8 {d0[7]}, [%0] \n" + + "vst1.64 {d0}, [%2] \n" + + "4: \n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_stride), // %3 + "+r"(width) // %4 + : "r"(&kVTbl4x4Transpose) // %5 + : "memory", "cc", "r9", "q0", "q1", "q2", "q3" + ); +} + +static const uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %6, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" + "1: \n" + "mov r9, %0 \n" + + "vld2.8 {d0, d1}, [r9], %1 \n" + "vld2.8 {d2, d3}, [r9], %1 \n" + "vld2.8 {d4, d5}, [r9], %1 \n" + "vld2.8 {d6, d7}, [r9], %1 \n" + "vld2.8 {d16, d17}, [r9], %1 \n" + "vld2.8 {d18, d19}, [r9], %1 \n" + "vld2.8 {d20, d21}, [r9], %1 \n" + "vld2.8 {d22, d23}, [r9] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov r9, %2 \n" + + "vst1.8 {d2}, [r9], %3 \n" + "vst1.8 {d0}, [r9], %3 \n" + "vst1.8 {d6}, [r9], %3 \n" + "vst1.8 {d4}, [r9], %3 \n" + "vst1.8 {d18}, [r9], %3 \n" + "vst1.8 {d16}, [r9], %3 \n" + "vst1.8 {d22}, [r9], %3 \n" + "vst1.8 {d20}, [r9] \n" + + "mov r9, %4 \n" + + "vst1.8 {d3}, [r9], %5 \n" + "vst1.8 {d1}, [r9], %5 \n" + "vst1.8 {d7}, [r9], %5 \n" + "vst1.8 {d5}, [r9], %5 \n" + "vst1.8 {d19}, [r9], %5 \n" + "vst1.8 {d17}, [r9], %5 \n" + "vst1.8 {d23}, [r9], %5 \n" + "vst1.8 {d21}, [r9] \n" + + "add %0, #8*2 \n" // src += 8*2 + "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %6, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %6, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %6, #2 \n" + "blt 3f \n" + + "cmp %6, #4 \n" + "blt 2f \n" + + //TODO(frkoenig) : clean this up + // 4x8 block + "mov r9, %0 \n" + "vld1.64 {d0}, [r9], %1 \n" + "vld1.64 {d1}, [r9], %1 \n" + "vld1.64 {d2}, [r9], %1 \n" + "vld1.64 {d3}, [r9], %1 \n" + "vld1.64 {d4}, [r9], %1 \n" + "vld1.64 {d5}, [r9], %1 \n" + "vld1.64 {d6}, [r9], %1 \n" + "vld1.64 {d7}, [r9] \n" + + "vld1.8 {q15}, [%7] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov r9, %2 \n" + + "vst1.32 {d16[0]}, [r9], %3 \n" + "vst1.32 {d16[1]}, [r9], %3 \n" + "vst1.32 {d17[0]}, [r9], %3 \n" + "vst1.32 {d17[1]}, [r9], %3 \n" + + "add r9, %2, #4 \n" + "vst1.32 {d20[0]}, [r9], %3 \n" + "vst1.32 {d20[1]}, [r9], %3 \n" + "vst1.32 {d21[0]}, [r9], %3 \n" + "vst1.32 {d21[1]}, [r9] \n" + + "mov r9, %4 \n" + + "vst1.32 {d18[0]}, [r9], %5 \n" + "vst1.32 {d18[1]}, [r9], %5 \n" + "vst1.32 {d19[0]}, [r9], %5 \n" + "vst1.32 {d19[1]}, [r9], %5 \n" + + "add r9, %4, #4 \n" + "vst1.32 {d22[0]}, [r9], %5 \n" + "vst1.32 {d22[1]}, [r9], %5 \n" + "vst1.32 {d23[0]}, [r9], %5 \n" + "vst1.32 {d23[1]}, [r9] \n" + + "add %0, #4*2 \n" // src += 4 * 2 + "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %6, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %6, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov r9, %0 \n" + "vld2.16 {d0[0], d2[0]}, [r9], %1 \n" + "vld2.16 {d1[0], d3[0]}, [r9], %1 \n" + "vld2.16 {d0[1], d2[1]}, [r9], %1 \n" + "vld2.16 {d1[1], d3[1]}, [r9], %1 \n" + "vld2.16 {d0[2], d2[2]}, [r9], %1 \n" + "vld2.16 {d1[2], d3[2]}, [r9], %1 \n" + "vld2.16 {d0[3], d2[3]}, [r9], %1 \n" + "vld2.16 {d1[3], d3[3]}, [r9] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov r9, %2 \n" + + "vst1.64 {d0}, [r9], %3 \n" + "vst1.64 {d2}, [r9] \n" + + "mov r9, %4 \n" + + "vst1.64 {d1}, [r9], %5 \n" + "vst1.64 {d3}, [r9] \n" + + "add %0, #2*2 \n" // src += 2 * 2 + "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %6, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0], %1 \n" + "vld2.8 {d0[1], d1[1]}, [%0], %1 \n" + "vld2.8 {d0[2], d1[2]}, [%0], %1 \n" + "vld2.8 {d0[3], d1[3]}, [%0], %1 \n" + "vld2.8 {d0[4], d1[4]}, [%0], %1 \n" + "vld2.8 {d0[5], d1[5]}, [%0], %1 \n" + "vld2.8 {d0[6], d1[6]}, [%0], %1 \n" + "vld2.8 {d0[7], d1[7]}, [%0] \n" + + "vst1.64 {d0}, [%2] \n" + "vst1.64 {d1}, [%4] \n" + + "4: \n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst_a), // %2 + "+r"(dst_stride_a), // %3 + "+r"(dst_b), // %4 + "+r"(dst_stride_b), // %5 + "+r"(width) // %6 + : "r"(&kVTbl4x4TransposeDi) // %7 + : "memory", "cc", "r9", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s deleted file mode 100644 index 75ea957a..00000000 --- a/files/source/rotate_neon.s +++ /dev/null @@ -1,563 +0,0 @@ - .global RestoreRegisters_NEON - .global ReverseLine_NEON - .global ReverseLineUV_NEON - .global 
SaveRegisters_NEON - .global TransposeWx8_NEON - .global TransposeUVWx8_NEON - .type RestoreRegisters_NEON, function - .type ReverseLine_NEON, function - .type ReverseLineUV_NEON, function - .type SaveRegisters_NEON, function - .type TransposeWx8_NEON, function - .type TransposeUVWx8_NEON, function - -@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width) -@ r0 const uint8* src -@ r1 uint8* dst -@ r2 width -ReverseLine_NEON: - - @ compute where to start writing destination - add r1, r2 @ dst + width - - @ work on segments that are multiples of 16 - lsrs r3, r2, #4 - - @ the output is written in two block. 8 bytes followed - @ by another 8. reading is done sequentially, from left to - @ right. writing is done from right to left in block sizes - @ r1, the destination pointer is incremented after writing - @ the first of the two blocks. need to subtract that 8 off - @ along with 16 to get the next location. - mov r3, #-24 - - beq Lline_residuals - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, #16 - - @ the loop needs to run on blocks of 16. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r2, #16 - -Lsegments_of_16: - vld1.8 {q0}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments. unable to reverse - @ the bytes in the entire 128 bits in one go. - vrev64.8 q0, q0 - - @ because of the inability to reverse the entire 128 bits - @ reverse the writing out of the two 64 bit segments. - vst1.8 {d1}, [r1]! - vst1.8 {d0}, [r1], r3 @ dst -= 16 - - subs r2, #16 - bge Lsegments_of_16 - - @ add 16 back to the counter. if the result is 0 there is no - @ residuals so return - adds r2, #16 - bxeq lr - - add r1, #16 - -Lline_residuals: - - mov r3, #-3 - - sub r1, #2 - subs r2, #2 - @ check for 16*n+1 scenarios where segments_of_2 should not - @ be run, but there is something left over. - blt Lsegment_of_1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_2: - vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 - - vst1.8 {d1[0]}, [r1]! - vst1.8 {d0[0]}, [r1], r3 @ dst -= 2 - - subs r2, #2 - bge Lsegments_of_2 - - adds r2, #2 - bxeq lr - -Lsegment_of_1: - add r1, #1 - vld1.8 {d0[0]}, [r0] - vst1.8 {d0[0]}, [r1] - - bx lr - -@ void TransposeWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst, int dst_stride, -@ int w) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst -@ r3 int dst_stride -@ stack int w -TransposeWx8_NEON: - push {r4,r8,r9,lr} - - ldr r8, [sp, #16] @ width - - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. 
this should be the majority of the plane -Lloop_8x8: - mov r9, r0 - - vld1.8 {d0}, [r9], r1 - vld1.8 {d1}, [r9], r1 - vld1.8 {d2}, [r9], r1 - vld1.8 {d3}, [r9], r1 - vld1.8 {d4}, [r9], r1 - vld1.8 {d5}, [r9], r1 - vld1.8 {d6}, [r9], r1 - vld1.8 {d7}, [r9] - - vtrn.8 d1, d0 - vtrn.8 d3, d2 - vtrn.8 d5, d4 - vtrn.8 d7, d6 - - vtrn.16 d1, d3 - vtrn.16 d0, d2 - vtrn.16 d5, d7 - vtrn.16 d4, d6 - - vtrn.32 d1, d5 - vtrn.32 d0, d4 - vtrn.32 d3, d7 - vtrn.32 d2, d6 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - - mov r9, r2 - - vst1.8 {d1}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d3}, [r9], r3 - vst1.8 {d2}, [r9], r3 - vst1.8 {d5}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d7}, [r9], r3 - vst1.8 {d6}, [r9] - - add r0, #8 @ src += 8 - add r2, r3, lsl #3 @ dst += 8 * dst_stride - subs r8, #8 @ w -= 8 - bge Lloop_8x8 - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. - adds r8, #8 - beq Ldone - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8 - - cmp r8, #4 - blt Lblock_2x8 - -Lblock_4x8: - mov r9, r0 - vld1.32 {d0[0]}, [r9], r1 - vld1.32 {d0[1]}, [r9], r1 - vld1.32 {d1[0]}, [r9], r1 - vld1.32 {d1[1]}, [r9], r1 - vld1.32 {d2[0]}, [r9], r1 - vld1.32 {d2[1]}, [r9], r1 - vld1.32 {d3[0]}, [r9], r1 - vld1.32 {d3[1]}, [r9] - - mov r9, r2 - - adr r12, vtbl_4x4_transpose - vld1.8 {q3}, [r12] - - vtbl.8 d4, {d0, d1}, d6 - vtbl.8 d5, {d0, d1}, d7 - vtbl.8 d0, {d2, d3}, d6 - vtbl.8 d1, {d2, d3}, d7 - - @ TODO: rework shuffle above to write - @ out with 4 instead of 8 writes - vst1.32 {d4[0]}, [r9], r3 - vst1.32 {d4[1]}, [r9], r3 - vst1.32 {d5[0]}, [r9], r3 - vst1.32 {d5[1]}, [r9] - - add r9, r2, #4 - vst1.32 {d0[0]}, [r9], r3 - vst1.32 {d0[1]}, [r9], r3 - vst1.32 {d1[0]}, [r9], r3 - vst1.32 {d1[1]}, [r9] - - add r0, #4 @ src += 4 - add r2, r3, lsl #2 @ dst += 4 * dst_stride - subs r8, #4 @ w -= 4 - beq Ldone - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8 - -Lblock_2x8: - mov r9, r0 - vld1.16 {d0[0]}, [r9], r1 - vld1.16 {d1[0]}, [r9], r1 - vld1.16 {d0[1]}, [r9], r1 - vld1.16 {d1[1]}, [r9], r1 - vld1.16 {d0[2]}, [r9], r1 - vld1.16 {d1[2]}, [r9], r1 - vld1.16 {d0[3]}, [r9], r1 - vld1.16 {d1[3]}, [r9] - - vtrn.8 d0, d1 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d1}, [r9] - - add r0, #2 @ src += 2 - add r2, r3, lsl #1 @ dst += 2 * dst_stride - subs r8, #2 @ w -= 2 - beq Ldone - -Lblock_1x8: - vld1.8 {d0[0]}, [r0], r1 - vld1.8 {d0[1]}, [r0], r1 - vld1.8 {d0[2]}, [r0], r1 - vld1.8 {d0[3]}, [r0], r1 - vld1.8 {d0[4]}, [r0], r1 - vld1.8 {d0[5]}, [r0], r1 - vld1.8 {d0[6]}, [r0], r1 - vld1.8 {d0[7]}, [r0] - - vst1.64 {d0}, [r2] - -Ldone: - - pop {r4,r8,r9,pc} - -vtbl_4x4_transpose: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - -@ void SaveRegisters_NEON (unsigned long long store) -@ r0 unsigned long long store -SaveRegisters_NEON: - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - -@ void RestoreRegisters_NEON (unsigned long long store) -@ r0 unsigned long long store -RestoreRegisters_NEON: - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! 
- bx lr - -@ void ReverseLineUV_NEON (const uint8* src, -@ uint8* dst_a, -@ uint8* dst_b, -@ int width) -@ r0 const uint8* src -@ r1 uint8* dst_a -@ r2 uint8* dst_b -@ r3 width -ReverseLineUV_NEON: - - @ compute where to start writing destination - add r1, r1, r3 @ dst_a + width - add r2, r2, r3 @ dst_b + width - - @ work on input segments that are multiples of 16, but - @ width that has been passed is output segments, half - @ the size of input. - lsrs r12, r3, #3 - - beq Lline_residuals_di - - @ the output is written in to two blocks. - mov r12, #-8 - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, r1, #8 - sub r2, r2, #8 - - @ the loop needs to run on blocks of 8. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r3, r3, #8 - -Lsegments_of_8_di: - vld2.8 {d0, d1}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments - vrev64.8 q0, q0 - - vst1.8 {d0}, [r1], r12 @ dst_a -= 8 - vst1.8 {d1}, [r2], r12 @ dst_b -= 8 - - subs r3, r3, #8 - bge Lsegments_of_8_di - - @ add 8 back to the counter. if the result is 0 there is no - @ residuals so return - adds r3, r3, #8 - bxeq lr - - add r1, r1, #8 - add r2, r2, #8 - -Lline_residuals_di: - - mov r12, #-1 - - sub r1, r1, #1 - sub r2, r2, #1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_1: - vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 - - vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1 - vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1 - - subs r3, r3, #1 - bgt Lsegments_of_1 - - bx lr - -@ void TransposeUVWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst_a, int dst_stride_a, -@ uint8* dst_b, int dst_stride_b, -@ int width) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst_a -@ r3 int dst_stride_a -@ stack uint8* dst_b -@ stack int dst_stride_b -@ stack int width -TransposeUVWx8_NEON: - push {r4-r9,lr} - - ldr r4, [sp, #28] @ dst_b - ldr r5, [sp, #32] @ dst_stride_b - ldr r8, [sp, #36] @ width - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. 
this should be the majority of the plane -Lloop_8x8_di: - mov r9, r0 - - vld2.8 {d0, d1}, [r9], r1 - vld2.8 {d2, d3}, [r9], r1 - vld2.8 {d4, d5}, [r9], r1 - vld2.8 {d6, d7}, [r9], r1 - vld2.8 {d8, d9}, [r9], r1 - vld2.8 {d10, d11}, [r9], r1 - vld2.8 {d12, d13}, [r9], r1 - vld2.8 {d14, d15}, [r9] - - vtrn.8 q1, q0 - vtrn.8 q3, q2 - vtrn.8 q5, q4 - vtrn.8 q7, q6 - - vtrn.16 q1, q3 - vtrn.16 q0, q2 - vtrn.16 q5, q7 - vtrn.16 q4, q6 - - vtrn.32 q1, q5 - vtrn.32 q0, q4 - vtrn.32 q3, q7 - vtrn.32 q2, q6 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - vrev16.8 q4, q4 - vrev16.8 q5, q5 - vrev16.8 q6, q6 - vrev16.8 q7, q7 - - mov r9, r2 - - vst1.8 {d2}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d6}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d10}, [r9], r3 - vst1.8 {d8}, [r9], r3 - vst1.8 {d14}, [r9], r3 - vst1.8 {d12}, [r9] - - mov r9, r4 - - vst1.8 {d3}, [r9], r5 - vst1.8 {d1}, [r9], r5 - vst1.8 {d7}, [r9], r5 - vst1.8 {d5}, [r9], r5 - vst1.8 {d11}, [r9], r5 - vst1.8 {d9}, [r9], r5 - vst1.8 {d15}, [r9], r5 - vst1.8 {d13}, [r9] - - add r0, #8*2 @ src += 8*2 - add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a - add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b - subs r8, #8 @ w -= 8 - bge Lloop_8x8_di - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. - adds r8, #8 - beq Ldone_di - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8_di - - cmp r8, #4 - blt Lblock_2x8_di - -@ TODO(frkoenig) : clean this up -Lblock_4x8_di: - mov r9, r0 - vld1.64 {d0}, [r9], r1 - vld1.64 {d1}, [r9], r1 - vld1.64 {d2}, [r9], r1 - vld1.64 {d3}, [r9], r1 - vld1.64 {d4}, [r9], r1 - vld1.64 {d5}, [r9], r1 - vld1.64 {d6}, [r9], r1 - vld1.64 {d7}, [r9] - - adr r12, vtbl_4x4_transpose_di - vld1.8 {q7}, [r12] - - vtrn.8 q0, q1 - vtrn.8 q2, q3 - - vtbl.8 d8, {d0, d1}, d14 - vtbl.8 d9, {d0, d1}, d15 - vtbl.8 d10, {d2, d3}, d14 - vtbl.8 d11, {d2, d3}, d15 - vtbl.8 d12, {d4, d5}, d14 - vtbl.8 d13, {d4, d5}, d15 - vtbl.8 d0, {d6, d7}, d14 - vtbl.8 d1, {d6, d7}, d15 - - mov r9, r2 - - vst1.32 {d8[0]}, [r9], r3 - vst1.32 {d8[1]}, [r9], r3 - vst1.32 {d9[0]}, [r9], r3 - vst1.32 {d9[1]}, [r9], r3 - - add r9, r2, #4 - vst1.32 {d12[0]}, [r9], r3 - vst1.32 {d12[1]}, [r9], r3 - vst1.32 {d13[0]}, [r9], r3 - vst1.32 {d13[1]}, [r9] - - mov r9, r4 - - vst1.32 {d10[0]}, [r9], r5 - vst1.32 {d10[1]}, [r9], r5 - vst1.32 {d11[0]}, [r9], r5 - vst1.32 {d11[1]}, [r9], r5 - - add r9, r4, #4 - vst1.32 {d0[0]}, [r9], r5 - vst1.32 {d0[1]}, [r9], r5 - vst1.32 {d1[0]}, [r9], r5 - vst1.32 {d1[1]}, [r9] - - add r0, #4*2 @ src += 4 * 2 - add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a - add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b - subs r8, #4 @ w -= 4 - beq Ldone_di - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8_di - -Lblock_2x8_di: - mov r9, r0 - vld2.16 {d0[0], d2[0]}, [r9], r1 - vld2.16 {d1[0], d3[0]}, [r9], r1 - vld2.16 {d0[1], d2[1]}, [r9], r1 - vld2.16 {d1[1], d3[1]}, [r9], r1 - vld2.16 {d0[2], d2[2]}, [r9], r1 - vld2.16 {d1[2], d3[2]}, [r9], r1 - vld2.16 {d0[3], d2[3]}, [r9], r1 - vld2.16 {d1[3], d3[3]}, [r9] - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d2}, [r9] - - mov r9, r4 - - vst1.64 {d1}, [r9], r5 - vst1.64 {d3}, [r9] - - add r0, #2*2 @ src += 2 * 2 - add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a - add r4, r5, lsl #1 @ dst_a += 2 * dst_stride_a - subs r8, #2 @ w -= 2 - beq Ldone_di - -Lblock_1x8_di: - vld2.8 {d0[0], d1[0]}, [r0], r1 - vld2.8 {d0[1], d1[1]}, 
[r0], r1 - vld2.8 {d0[2], d1[2]}, [r0], r1 - vld2.8 {d0[3], d1[3]}, [r0], r1 - vld2.8 {d0[4], d1[4]}, [r0], r1 - vld2.8 {d0[5], d1[5]}, [r0], r1 - vld2.8 {d0[6], d1[6]}, [r0], r1 - vld2.8 {d0[7], d1[7]}, [r0] - - vst1.64 {d0}, [r2] - vst1.64 {d1}, [r4] - -Ldone_di: - pop {r4-r9, pc} - -vtbl_4x4_transpose_di: - .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 diff --git a/files/source/rotate_priv.h b/files/source/rotate_priv.h deleted file mode 100644 index b4df1494..00000000 --- a/files/source/rotate_priv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef SOURCE_ROTATE_PRIV_H_ -#define SOURCE_ROTATE_PRIV_H_ - -#include "libyuv/basic_types.h" - -namespace libyuv { - -// Rotate planes by 90, 180, 270 -void -RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); - -void -RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); - -void -RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); - -void -RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -// Rotations for when U and V are interleaved. -// These functions take one input pointer and -// split the data into two buffers while -// rotating them. -void -RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -void -RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -// The 90 and 270 functions are based on transposes. -// Doing a transpose with reversing the read/write -// order will result in a rotation by +- 90 degrees. -void -TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); - -void -TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -} // namespace libyuv - -#endif // SOURCE_ROTATE_PRIV_H_ diff --git a/files/source/row.h b/files/source/row.h deleted file mode 100644 index 85343c56..00000000 --- a/files/source/row.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef LIBYUV_SOURCE_ROW_H_ -#define LIBYUV_SOURCE_ROW_H_ - -#include "libyuv/basic_types.h" - -// The following are available on all x86 platforms -#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BG24TOARGBROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 -#define HAS_RGB24TOUVROW_SSSE3 -#define HAS_RAWTOUVROW_SSSE3 -#endif - -// The following are available only on Windows -#if defined(WIN32) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -#define HAS_BGRATOYROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_ABGRTOUVROW_SSSE3 -#endif - -extern "C" { -#ifdef HAS_ARGBTOYROW_SSSE3 -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -#endif -#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) -#define HASRGB24TOYROW_SSSE3 -#endif -#ifdef HASRGB24TOYROW_SSSE3 -void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -#endif -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); - -#ifdef HAS_BG24TOARGBROW_SSSE3 -void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); -void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); -#endif -void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); -void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); - -#if defined(_MSC_VER) -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#define TALIGN16(t, var) t var __attribute__((aligned(16))) -#endif - -#ifdef OSX -extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); -extern 
SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); -#else -extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); -#endif -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width); - -// Method to force C version. -//#define USE_MMX 0 -//#define USE_SSE2 0 - -#if !defined(USE_MMX) -// Windows, Mac and Linux use MMX -#if defined(__i386__) || defined(_MSC_VER) -#define USE_MMX 1 -#else -#define USE_MMX 0 -#endif -#endif - -#if !defined(USE_SSE2) -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 -#define USE_SSE2 1 -#else -#define USE_SSE2 0 -#endif -#endif - -// x64 uses MMX2 (SSE) so emms is not required. -// Warning C4799: function has no EMMS instruction. -// EMMS() is slow and should be called by the calling function once per image. -#if USE_MMX && !defined(ARCH_CPU_X86_64) -#if defined(_MSC_VER) -#define EMMS() __asm emms -#pragma warning(disable: 4799) -#else -#define EMMS() asm("emms") -#endif -#else -#define EMMS() -#endif - - -} // extern "C" - -#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/files/source/row_common.cc b/files/source/row_common.cc new file mode 100644 index 00000000..c5f3ce05 --- /dev/null +++ b/files/source/row_common.cc @@ -0,0 +1,1246 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include <string.h> // For memcpy + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + // To support in-place conversion. + uint8 a = src_bgra[0]; + uint8 r = src_bgra[1]; + uint8 g = src_bgra[2]; + uint8 b = src_bgra[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_bgra += 4; + } +} + +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + // To support in-place conversion. + uint8 r = src_abgr[0]; + uint8 g = src_abgr[1]; + uint8 b = src_abgr[2]; + uint8 a = src_abgr[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_abgr += 4; + } +} + +void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + // To support in-place conversion. 
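// (Editorial note: "in-place" means dst_argb may alias src_abgr; all four
// source bytes are loaded into locals before any store, so the swizzle is
// safe when the two pointers are equal.)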
+ uint8 a = src_abgr[0]; + uint8 b = src_abgr[1]; + uint8 g = src_abgr[2]; + uint8 r = src_abgr[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_abgr += 4; + } +} + +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_rgb24[0]; + uint8 g = src_rgb24[1]; + uint8 r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_rgb[0] & 0x1f; + uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3); + uint8 r = src_rgb[1] >> 3; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 2) | (g >> 4); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_rgb[0] & 0x1f; + uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3); + uint8 r = (src_rgb[1] & 0x7c) >> 2; + uint8 a = src_rgb[1] >> 7; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 3) | (g >> 2); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = -a; + dst_argb += 4; + src_rgb += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_rgb[0] & 0x0f; + uint8 g = src_rgb[0] >> 4; + uint8 r = src_rgb[1] & 0x0f; + uint8 a = src_rgb[1] >> 4; + dst_argb[0] = (b << 4) | b; + dst_argb[1] = (g << 4) | g; + dst_argb[2] = (r << 4) | r; + dst_argb[3] = (a << 4) | a; + dst_argb += 4; + src_rgb += 2; + } +} + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + uint8 a = src_argb[3]; + dst_rgb[0] = a; + dst_rgb[1] = b; + dst_rgb[2] = g; + dst_rgb[3] = r; + dst_rgb += 4; + src_argb += 4; + } +} + +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +// TODO(fbarchard): support big endian CPU +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 2; + uint8 r1 = src_argb[6] >> 3; + *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 
r0 = src_argb[2] >> 3; + *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 3; + uint8 r1 = src_argb[6] >> 3; + uint8 a1 = src_argb[7] >> 7; + *reinterpret_cast<uint32*>(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + *reinterpret_cast<uint16*>(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + for (int x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + uint8 b1 = src_argb[4] >> 4; + uint8 g1 = src_argb[5] >> 4; + uint8 r1 = src_argb[6] >> 4; + uint8 a1 = src_argb[7] >> 4; + *reinterpret_cast<uint32*>(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + *reinterpret_cast<uint16*>(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +} + +static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; +} +static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +#define MAKEROWY(NAME, R, G, B) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + for (int x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += 4; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + for (int x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \ + src_rgb1[B] + src_rgb1[B + 4]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \ + src_rgb1[G] + src_rgb1[G + 4]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \ + src_rgb1[R] + src_rgb1[R + 4]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += 8; \ + src_rgb1 += 8; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB, 2, 1, 0) +MAKEROWY(BGRA, 1, 2, 3) +MAKEROWY(ABGR, 0, 1, 2) +MAKEROWY(RGBA, 3, 2, 1) + +// http://en.wikipedia.org/wiki/Grayscale. +// 0.11 * B + 0.59 * G + 0.30 * R +// Coefficients rounded to multiple of 2 for consistency with SSSE3 version. 
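// (Editorial check of the fixed-point scaling: the RGBToY coefficients
// above sum to 66 + 129 + 25 = 220, so full white maps to
// ((220 * 255 + 128) >> 8) + 16 = 235, the BT.601 studio-range maximum,
// while the gray coefficients below sum to 76 + 152 + 28 = 256, so
// RGBToGray(255, 255, 255) is (256 * 255) >> 8 = 255, leaving white
// unchanged.)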
+static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) { + return (( 76 * r + 152 * g + 28 * b) >> 8); +} + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not over flow. a is preserved from original. + if (sg > 255) { + sg = 255; + } + if (sr > 255) { + sr = 255; + } + dst_argb[0] = sb; + dst_argb[1] = sg; + dst_argb[2] = sr; + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 7; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 7; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 7; + if (sb < 0) { + sb = 0; + } + if (sb > 255) { + sb = 255; + } + if (sg < 0) { + sg = 0; + } + if (sg > 255) { + sg = 255; + } + if (sr < 0) { + sr = 0; + } + if (sr > 255) { + sr = 255; + } + dst_argb[0] = sb; + dst_argb[1] = sg; + dst_argb[2] = sr; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { + // Copy a Y to RGB. + for (int x = 0; x < width; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// C reference code that mimics the YUV assembly. 
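// (Editorial sketch, not part of the patch: with the constants defined just
// below (YG = 74, UB = 127, UG = -25, VG = -52, VR = 102, UR = VB = 0), the
// BB/BG/BR bias terms simply pre-subtract the constants times 128, so
// YuvPixel is the familiar BT.601 conversion applied to (u - 128) and
// (v - 128), scaled by 64. The two helpers that follow use hypothetical,
// illustrative names only; Clip() and YuvPixel() below are the real code.)
static __inline int ClipToByteSketch(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static __inline void Yuv601ToRgbSketch(int y, int u, int v,
                                       int* r, int* g, int* b) {
  int y1 = (y - 16) * 74;                               // ~1.164, scaled by 64.
  *r = ClipToByteSketch((y1 + (v - 128) * 102) >> 6);   // ~1.596 * V
  *g = ClipToByteSketch((y1 - (u - 128) * 25            // ~0.391 * U
                            - (v - 128) * 52) >> 6);    // ~0.813 * V
  *b = ClipToByteSketch((y1 + (u - 128) * 127) >> 6);   // ~2.018 * U, clamped
                                                        // to the int8 range.
}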
+ +#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ +#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ +#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +static __inline uint32 Clip(int32 val) { + if (val < 0) { + return static_cast<uint32>(0); + } else if (val > 255) { + return static_cast<uint32>(255); + } + return static_cast<uint32>(val); +} + +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, + int ashift, int rshift, int gshift, int bshift) { + int32 y1 = (static_cast<int32>(y) - 16) * YG; + uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6); + uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6); + uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6); + *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) | + (g << gshift) | + (r << rshift) | + (255u << ashift); +} + +static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + int32 y1 = (static_cast<int32>(y) - 16) * YG; + *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6); + *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6); + *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6); +} + +void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0); + y_buf += 1; + u_buf += 1; + v_buf += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +// Also used for 420 +void I422ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void I422ToRGB24Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel2(y_buf[1], u_buf[0], v_buf[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel2(y_buf[1], u_buf[0], v_buf[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 6; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel2(y_buf[0], u_buf[0], v_buf[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I411ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 3; x += 4) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0); + YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0); + y_buf += 4; + u_buf += 1; + v_buf += 1; + rgb_buf += 16; // Advance 4 pixels. + } + if (width & 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void NV12ToARGBRow_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + uv_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void NV21ToARGBRow_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + vu_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void I422ToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24); + } +} + +void I422ToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16); + } +} + +void I422ToRGBARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8); + } +} + +void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) { + for (int x = 0; x < width; ++x) { + YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); + y_buf += 1; + rgb_buf += 4; // Advance 1 pixel. 
+ } +} + +void MirrorRow_C(const uint8* src, uint8* dst, int width) { + src += width - 1; + for (int x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + src_uv += (width - 1) << 1; + for (int x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { + const uint32* src32 = reinterpret_cast<const uint32*>(src); + uint32* dst32 = reinterpret_cast<uint32*>(dst); + src32 += width - 1; + for (int x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + +void SetRow8_C(uint8* dst, uint32 v8, int count) { +#ifdef _MSC_VER + // VC will generate rep stosb. + for (int x = 0; x < count; ++x) { + dst[x] = v8; + } +#else + memset(dst, v8, count); +#endif +} + +void SetRows32_C(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + uint32* d = reinterpret_cast<uint32*>(dst); + for (int x = 0; x < width; ++x) { + d[x] = v32; + } + dst += dst_stride; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + for (int x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + for (int x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { + // Output a row of Y values. + for (int x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + for (int x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). 
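// (Editorial note on the packed 4:2:2 layouts handled here: a YUY2 pixel
// pair is stored as Y0 U0 Y1 V0 and a UYVY pair as U0 Y0 V0 Y1. Hence the
// YUY2 helpers above take luma from even bytes and chroma from odd bytes,
// the UYVY helpers do the opposite, and the *ToUVRow variants additionally
// average the chroma of two source rows, with rounding, to reduce 4:2:2
// input to 4:2:0 output.)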
+void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + for (int x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { + // Output a row of Y values. + for (int x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + for (int x = 0; x < width - 1; x += 2) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + for (int i = 0; i < width - 1; i += 2) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32 b = src_argb[0]; + const uint32 g = src_argb[1]; + const uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. 
ie 125 +// 8.16 fixed point inverse table +#define T(a) 0x10000 / a +uint32 fixed_invtbl8[256] = { + 0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + for (int i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + if (a) { + const uint32 ia = fixed_invtbl8[a]; // 8.16 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. 
+ if (b > 255) { + b = 255; + } + if (g > 255) { + g = 255; + } + if (r > 255) { + r = 255; + } + } + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +// Wrappers to handle odd width +#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* u_buf, \ + const uint8* v_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~7; \ + I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \ + I420TORGB_C(y_buf + n, \ + u_buf + (n >> UV_SHIFT), \ + v_buf + (n >> UV_SHIFT), \ + rgb_buf + n * 4, width & 7); \ + } + +// Wrappers to handle odd width +#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* uv_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~7; \ + NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \ + NV12TORGB_C(y_buf + n, \ + uv_buf + (n >> UV_SHIFT), \ + rgb_buf + n * 4, width & 7); \ + } + + +#ifdef HAS_I422TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0) +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1) +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2) +Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0) +Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0) +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) +#endif +#ifdef HAS_I422TORGB24ROW_SSSE3 +YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \ + I422ToRGB24Row_C, 1) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1) +#endif +#ifdef HAS_I422TORGBAROW_SSSE3 +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) +#endif +#ifdef HAS_I422TOARGBROW_NEON +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) +YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1) +Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0) +Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0) +YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1) +YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1) +#endif +#undef YANY + +#define RGBANY(NAMEANY, ARGBTORGB, BPP) \ + void NAMEANY(const uint8* argb_buf, \ + uint8* rgb_buf, \ + int width) { \ + SIMD_ALIGNED(uint8 row[kMaxStride]); \ + ARGBTORGB(argb_buf, row, width); \ + memcpy(rgb_buf, row, width * BPP); \ + } + +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) +RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3) +RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) +RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) +RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3) +#endif +#undef RGBANY + +#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + ARGBTOY_SSE(src_argb, dst_y, width - 16); \ + ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \ + } 
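The *_Any_* wrappers generated by the macros above all follow the same pattern: run the SIMD row function on the largest prefix the vector code can handle (a multiple of 8 or 16 pixels), then let the plain C row function finish the leftover pixels. The following is a minimal standalone sketch of that pattern, not libyuv code; FastRow8 and SlowRow are hypothetical stand-ins for a SIMD routine that requires a multiple-of-8 width and its scalar fallback.

#include <stdint.h>
#include <string.h>

// Hypothetical fast path: stands in for a SIMD row function that only
// handles widths that are a multiple of 8 pixels.
static void FastRow8(const uint8_t* src, uint8_t* dst, int width) {
  memcpy(dst, src, (size_t)width);  // imagine a vector loop here
}

// Hypothetical scalar fallback for the 1..7 leftover pixels.
static void SlowRow(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[x];
  }
}

// Shape of an *_Any_* wrapper: SIMD on the multiple-of-8 prefix,
// C on whatever remains.
void CopyRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  int n = width & ~7;               // largest multiple of 8 <= width
  if (n > 0) {
    FastRow8(src, dst, n);
  }
  if (width & 7) {
    SlowRow(src + n, dst + n, width & 7);
  }
}

The YANY macro defined directly above takes a slightly different approach for the tail: instead of calling a C fallback it re-runs the SIMD routine on the final 16 pixels, overlapping pixels that were already converted, which is harmless because recomputing them produces the same bytes.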
+ +#ifdef HAS_ARGBTOYROW_SSSE3 +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4) +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4) +#endif +#ifdef HAS_RGBATOYROW_SSSE3 +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4) +#endif +#ifdef HAS_YUY2TOYROW_SSE2 +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2) +#endif +#ifdef HAS_YUY2TOYROW_NEON +YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2) +YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2) +#endif +#undef YANY + +#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \ + void NAMEANY(const uint8* src_argb, int src_stride_argb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~15; \ + ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ + dst_u + (n >> 1), \ + dst_v + (n >> 1), \ + width & 15); \ + } + +#ifdef HAS_ARGBTOUVROW_SSSE3 +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4) +#endif +#ifdef HAS_RGBATOYROW_SSSE3 +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2) +UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2) +#endif +#undef UVANY + +#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \ + void NAMEANY(const uint8* src_argb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~15; \ + ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \ + ANYTOUV_C(src_argb + n * BPP, \ + dst_u + (n >> 1), \ + dst_v + (n >> 1), \ + width & 15); \ + } + +#ifdef HAS_YUY2TOUV422ROW_SSE2 +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \ + YUY2ToUV422Row_C, 2) +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \ + UYVYToUV422Row_C, 2) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \ + YUY2ToUV422Row_C, 2) +UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \ + UYVYToUV422Row_C, 2) +#endif +#undef UV422ANY + +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; + for (int x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverage_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; + for (int i = 0; i < count; ++i) { + dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 
4; + bl += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 24 + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + const uint32 b_scale = REPEAT8(value & 0xff); + const uint32 g_scale = REPEAT8((value >> 8) & 0xff); + const uint32 r_scale = REPEAT8((value >> 16) & 0xff); + const uint32 a_scale = REPEAT8(value >> 24); + + for (int i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (int i = 0; i < width; ++i) { + int x = static_cast<int>(uv[0]); + int y = static_cast<int>(uv[1]); + *reinterpret_cast<uint32*>(dst_argb) = + *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride + + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// C version 2x2 -> 2x1. +void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + uint8* end = dst_ptr + (dst_width << 2); + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc new file mode 100644 index 00000000..19a78330 --- /dev/null +++ b/files/source/row_neon.cc @@ -0,0 +1,829 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vld1.u32 {d2[0]}, [%1]! \n" \ + "vld1.u32 {d2[1]}, [%2]! \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vld1.u8 {d2}, [%1]! 
\n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" \ + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vld1.u8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" \ + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqrshrun.s16 d0, q10, #6 \n" \ + "vqrshrun.s16 d1, q11, #6 \n" \ + "vqrshrun.s16 d2, q8, #6 \n" \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \ + defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON) +static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%4] \n" + "vld1.u8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(uv_buf), // %1 + "+r"(rgb_buf), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%4] \n" + "vld1.u8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(y_buf), // %0 + "+r"(uv_buf), // %1 + "+r"(rgb_buf), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_SPLITUV_NEON +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v +// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. +void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.u8 {q0}, [%1]! \n" // store U + "vst1.u8 {q1}, [%2]! \n" // Store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "memory", "cc", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SPLITUV_NEON + +#ifdef HAS_COPYROW_NEON +// Copy multiple of 64 +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vldm %0!, {q0, q1, q2, q3} \n" // load 64 + "subs %2, %2, #64 \n" // 64 processed per loop + "vstm %1!, {q0, q1, q2, q3} \n" // store 64 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +#ifdef HAS_SETROW_NEON +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +void SetRow8_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( // NOLINT + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.u32 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "q0", "memory", "cc"); +} + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +void SetRows32_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow8_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} +#endif // HAS_SETROW_NEON + +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %2 \n" + // work on segments that are multiples of 16 + "lsrs r3, %2, #4 \n" + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. writing is done from right to left in block sizes + // %1, the destination pointer is incremented after writing + // the first of the two blocks. 
need to subtract that 8 off + // along with 16 to get the next location. + "mov r3, #-24 \n" + "beq 2f \n" + + // back of destination by the size of the register that is + // going to be mirrored + "sub %1, #16 \n" + // the loop needs to run on blocks of 16. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. If this isn't subtracted off here the + // loop will run one extra time. + "sub %2, #16 \n" + + // mirror the bytes in the 64 bit segments. unable to mirror + // the bytes in the entire 128 bits in one go. + // because of the inability to mirror the entire 128 bits + // mirror the writing out of the two 64 bit segments. + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // src += 16 + "subs %2, #16 \n" + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" + "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 + "bge 1b \n" + + // add 16 back to the counter. if the result is 0 there is no + // residuals so jump past + "adds %2, #16 \n" + "beq 5f \n" + "add %1, #16 \n" + "2: \n" + "mov r3, #-3 \n" + "sub %1, #2 \n" + "subs %2, #2 \n" + // check for 16*n+1 scenarios where segments_of_2 should not + // be run, but there is something left over. + "blt 4f \n" + +// do this in neon registers as per +// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 + "subs %2, #2 \n" + "vst1.8 {d1[0]}, [%1]! \n" + "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 + "bge 3b \n" + + "adds %2, #2 \n" + "beq 5f \n" + "4: \n" + "add %1, #1 \n" + "vld1.8 {d0[0]}, [%0] \n" + "vst1.8 {d0[0]}, [%1] \n" + "5: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "r3", "q0" + ); +} +#endif // HAS_MIRRORROW_NEON + +#ifdef HAS_MIRRORROWUV_NEON +void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %3 \n" // dst_a + width + "add %2, %3 \n" // dst_b + width + // work on input segments that are multiples of 16, but + // width that has been passed is output segments, half + // the size of input. + "lsrs r12, %3, #3 \n" + "beq 2f \n" + // the output is written in to two blocks. + "mov r12, #-8 \n" + // back of destination by the size of the register that is + // going to be mirrord + "sub %1, #8 \n" + "sub %2, #8 \n" + // the loop needs to run on blocks of 8. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %3, #8 \n" + + // mirror the bytes in the 64 bit segments + ".p2align 2 \n" + "1: \n" + "vld2.8 {d0, d1}, [%0]! \n" // src += 16 + "subs %3, #8 \n" + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 + "bge 1b \n" + + // add 8 back to the counter. if the result is 0 there is no + // residuals so return + "adds %3, #8 \n" + "beq 4f \n" + "add %1, #8 \n" + "add %2, #8 \n" + "2: \n" + "mov r12, #-1 \n" + "sub %1, #1 \n" + "sub %2, #1 \n" + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! 
\n" // src += 2 + "subs %3, %3, #1 \n" + "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 + "bgt 3b \n" + "4: \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "r12", "q0" + ); +} +#endif // HAS_MIRRORROWUV_NEON + +#ifdef HAS_BGRATOARGBROW_NEON +void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d2 \n" // swap G, R + "vswp.u8 d0, d3 \n" // swap B, A + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_BGRATOARGBROW_NEON + +#ifdef HAS_ABGRTOARGBROW_NEON +void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d0, d2 \n" // swap R, B + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_ABGRTOARGBROW_NEON + +#ifdef HAS_RGBATOARGBROW_NEON +void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmov.u8 d4, d0 \n" // move A after RGB + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RGBATOARGBROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#ifdef HAS_ARGBTORGBAROW_NEON +void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmov.u8 d0, d4 \n" // move A before RGB. + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgba), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORGBAROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "q0", "q1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "q0", "q1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.u8 {d1}, [%1]! \n" // store 8 U. + "vst1.u8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.u8 {d0}, [%1]! \n" // store 8 U. + "vst1.u8 {d2}, [%2]! \n" // store 8 V. 
+ "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "adds %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.u8 {d1}, [%2]! \n" // store 8 U. + "vst1.u8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "adds %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.u8 {d0}, [%2]! \n" // store 8 U. + "vst1.u8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc index 88ce475b..33149dad 100644 --- a/files/source/row_posix.cc +++ b/files/source/row_posix.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,652 +8,3655 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "row.h" +#include "libyuv/row.h" +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { extern "C" { +#endif + +// This module is for GCC x86 and x64 +#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) + +// GCC 4.2 on OSX has link error when passing static or const to inline. +// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. +#ifdef __APPLE__ +#define CONST +#else +#define CONST static const +#endif #ifdef HAS_ARGBTOYROW_SSSE3 -// Constant multiplication table for converting ARGB to I400. 
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { - 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u +// Constants for ARGB +CONST vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +CONST vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +CONST vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +// Constants for BGRA +CONST vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +CONST vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +CONST vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +CONST vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +CONST vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +CONST vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +CONST uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; -extern "C" TALIGN16(const uint8, kAdd16[16]) = { - 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u +CONST uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; -// Shuffle table for converting BG24 to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { +// Shuffle table for converting RGB24 to ARGB. +CONST uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { +CONST uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" - "movdqa (%4),%%xmm6\n" - "movdqa %%xmm6,%%xmm5\n" - "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte. 
-"1:" - "movdqa (%0),%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "psrlw $0x7,%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm1\n" - "lea 0x20(%0),%0\n" - "psrlw $0x7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "paddb %%xmm5,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "r"(kMultiplyMaskARGBToI400), // %3 - "r"(kAdd16) // %4 - : "memory" -); -} -#endif - -#ifdef HAS_BG24TOARGBROW_SSSE3 -void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src_bg24), // %0 +// Shuffle table for converting ABGR to ARGB. +CONST uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting BGRA to ARGB. +CONST uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting RGBA to ARGB. +CONST uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + +// Shuffle table for converting ARGB to RGBA. +CONST uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RGB24. +CONST uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. 
+CONST uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 - : "r"(kShuffleMaskBG24ToARGB) // %3 - : "memory" -); + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { + asm volatile ( + "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + + : "+r"(src_abgr), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskABGRToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} + +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { + asm volatile ( + "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskBGRAToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} + +void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { + asm volatile ( + "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + + : "+r"(src_rgba), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGBAToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} + +void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { + asm volatile ( + "movdqa %3,%%xmm5 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + + : "+r"(src_argb), // %0 + "+r"(dst_rgba), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRGBA) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} + +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 
\n" + "sub $0x10,%2 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 - : "r"(kShuffleMaskRAWToARGB) // %3 - : "memory" -); + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,(%1,%0,2) \n" + "movdqa %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif + ); +} -#if defined(__x86_64__) +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* 
dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,(%1,%0,2) \n" + "movdqa %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} -// 64 bit linux gcc version - -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" - "ja 1b\n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsRgbY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" -); -} - -void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" - "ja 1b\n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsBgraY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" -); -} - -void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq 
(%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" - "ja 1b\n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsAbgrY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" -); -} - -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 1(%0),%0\n" - "paddsw %%xmm0,%%xmm2\n" - "shufps $0x44,%%xmm2,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movd %%xmm2,0x0(%3)\n" - "lea 4(%3),%3\n" - "sub $0x1,%4\n" - "ja 1b\n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsRgbY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2" -); -} - -void FastConvertYToRGB32Row(const uint8* y_buf, // rdi - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%0),%%r10\n" - "movzb 0x1(%0),%%r11\n" - "movq (%3,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%3,%%r11,8),%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%1)\n" - "lea 8(%1),%1\n" - "sub $0x2,%2\n" - "ja 1b\n" +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%1,%0,2) \n" + "movdqa %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : 
"m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 
0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +// TODO(fbarchard): pass xmm constants to single block of assembly. +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around +// and considered unsafe. 
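For readers of this hunk: the ARGBToY / ARGBToUV rows above and below implement a fixed-point BT.601 studio-swing conversion via pmaddubsw against the kARGBToY / kARGBToU / kARGBToV tables (defined elsewhere in this file), with kAddY16 / kAddUV128 supplying the +16 and +128 offsets. A rough scalar equivalent is sketched here for orientation only; the coefficient values used below (13/65/33 for Y at 7-bit precision, 112/-74/-38 and 112/-94/-18 for U/V at 8-bit precision) are the conventional BT.601 constants and are assumed rather than copied from those tables, and the 2x2 averaging only approximates the rounding that pavgb performs.

// Illustrative scalar sketch (not part of the patch). Assumes pixels are laid
// out B,G,R,A in memory and that >> on a negative int is an arithmetic shift,
// matching the psraw used by the SIMD path.
#include <stdint.h>

static void ARGBToYRow_Sketch(const uint8_t* src_argb, uint8_t* dst_y,
                              int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2];
    // Y = 0.098*B + 0.504*G + 0.257*R + 16, in 7-bit fixed point.
    dst_y[x] = static_cast<uint8_t>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    src_argb += 4;  // Alpha byte (src_argb[3]) is ignored.
  }
}

static void ARGBToUVRow_Sketch(const uint8_t* src_argb, int src_stride_argb,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  // One U/V pair per 2x2 block of pixels, averaged first, as the SIMD version
  // does with pavgb plus the shufps/pavgb horizontal pass.
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = src_argb + x * 4;
    const uint8_t* p1 = p0 + src_stride_argb;
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    // U = 0.439*B - 0.291*G - 0.148*R + 128; V = 0.439*R - 0.368*G - 0.071*B + 128.
    dst_u[x / 2] = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = static_cast<uint8_t>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
  }
}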
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_argb)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_argb)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + 
"pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb 
%%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 
\n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_I422TOARGBROW_SSSE3 +#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ +#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ +#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ + +struct { + vec8 kUVToB; // 0 + vec8 kUVToG; // 16 + vec8 kUVToR; // 32 + vec16 kUVBiasB; // 48 + vec16 kUVBiasG; // 64 + vec16 kUVBiasR; // 80 + vec16 kYSub16; // 96 + vec16 kYToRgb; // 112 + vec8 kVUToB; // 128 + vec8 kVUToG; // 144 + vec8 kVUToR; // 160 +} CONST SIMD_ALIGNED(kYuvConstants) = { + { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, 
VG, UG, VG, UG, VG, UG, VG }, + { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR }, + { 16, 16, 16, 16, 16, 16, 16, 16 }, + { YG, YG, YG, YG, YG, YG, YG, YG }, + { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } +}; + + +// Read 8 UV from 411 +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + +// Read 2 UV from 411, upsample to 8 UV +#define READYUV411 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x2(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" \ + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \ + "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \ + "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ + "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ + "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ + "movq (%[y_buf]),%%xmm3 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ + "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" \ + +// Convert 8 pixels: 8 VU and 8 Y +#define YVUTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \ + "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \ + "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \ + "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ + "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ + "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ + "movq (%[y_buf]),%%xmm3 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ + "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" \ + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa 
%%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + 
"punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(vu_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + asm volatile ( + 
"pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(vu_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5,(%[argb_buf]) \n" + "movdqa %%xmm0,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(bgra_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(abgr_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP 
I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5,(%[argb_buf]) \n" + "movdqu %%xmm0,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(bgra_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(abgr_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "mov $0x10001000,%%eax \n" + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "mov $0x012a012a,%%eax \n" + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + ".p2align 4 \n" + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,16(%1) \n" + "lea 32(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 - "+r"(width) // %2 - : "r" (_kCoefficientsRgbY) // %3 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" -); -} - -#elif defined(__i386__) -// 32 bit gcc version - -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToRGB32Row\n" -"_FastConvertYUVToRGB32Row:\n" -#else - ".global FastConvertYUVToRGB32Row\n" -"FastConvertYUVToRGB32Row:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" - -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsRgbY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToBGRARow\n" -"_FastConvertYUVToBGRARow:\n" -#else - ".global FastConvertYUVToBGRARow\n" -"FastConvertYUVToBGRARow:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" - -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsBgraY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToABGRRow\n" -"_FastConvertYUVToABGRRow:\n" -#else - ".global FastConvertYUVToABGRRow\n" -"FastConvertYUVToABGRRow:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" - -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw 
_kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUV444ToRGB32Row\n" -"_FastConvertYUV444ToRGB32Row:\n" -#else - ".global FastConvertYUV444ToRGB32Row\n" -"FastConvertYUV444ToRGB32Row:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" - -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" - "lea 1(%edx),%edx\n" - "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n" - "psraw $0x6,%mm0\n" - "packuswb %mm0,%mm0\n" - "movd %mm0,0x0(%ebp)\n" - "lea 4(%ebp),%ebp\n" - "sub $0x1,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYToRGB32Row\n" -"_FastConvertYToRGB32Row:\n" -#else - ".global FastConvertYToRGB32Row\n" -"FastConvertYToRGB32Row:\n" -#endif - "push %ebx\n" - "mov 0x8(%esp),%eax\n" - "mov 0xc(%esp),%edx\n" - "mov 0x10(%esp),%ecx\n" - -"1:" - "movzbl (%eax),%ebx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n" - "psraw $0x6,%mm0\n" - "movzbl 0x1(%eax),%ebx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n" - "psraw $0x6,%mm1\n" - "packuswb %mm1,%mm0\n" - "lea 0x2(%eax),%eax\n" - "movq %mm0,(%edx)\n" - "lea 0x8(%edx),%edx\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "pop %ebx\n" - "ret\n" -); + "+rm"(width) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_YTOARGBROW_SSE2 -#else -// C reference code that mimic the YUV assembly. -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ - (((x) + (y)) > 32767 ? 
32767 : ((x) + (y)))) - -static inline void YuvPixel(uint8 y, - uint8 u, - uint8 v, - uint8* rgb_buf, - int ashift, - int rshift, - int gshift, - int bshift) { - - int b = _kCoefficientsRgbY[256+u][0]; - int g = _kCoefficientsRgbY[256+u][1]; - int r = _kCoefficientsRgbY[256+u][2]; - int a = _kCoefficientsRgbY[256+u][3]; - - b = paddsw(b, _kCoefficientsRgbY[512+v][0]); - g = paddsw(g, _kCoefficientsRgbY[512+v][1]); - r = paddsw(r, _kCoefficientsRgbY[512+v][2]); - a = paddsw(a, _kCoefficientsRgbY[512+v][3]); - - b = paddsw(b, _kCoefficientsRgbY[y][0]); - g = paddsw(g, _kCoefficientsRgbY[y][1]); - r = paddsw(r, _kCoefficientsRgbY[y][2]); - a = paddsw(a, _kCoefficientsRgbY[y][3]); - - b >>= 6; - g >>= 6; - r >>= 6; - a >>= 6; - - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) | - (packuswb(g) << gshift) | - (packuswb(r) << rshift) | - (packuswb(a) << ashift); -} - -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0); - } - rgb_buf += 8; // Advance 2 pixels. - } +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +CONST uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + "lea -0x10(%0),%0 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0,%2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); } +#endif // HAS_MIRRORROW_SSSE3 -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24); - } - rgb_buf += 8; // Advance 2 pixels. +#ifdef HAS_MIRRORROW_SSE2 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile ( + "lea -0x10(%0),%0 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0,%2),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
+CONST uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea -16(%0,%3,2),%0 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -16(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "sub $8,%3 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,(%1,%2) \n" + "lea 8(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +CONST uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + "lea -0x10(%0),%0 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0,%2,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMIRRORROW_SSSE3 + +#ifdef HAS_SPLITUV_SSE2 +void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm2,(%1,%2) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SPLITUV_SSE2 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa %%xmm0,(%0,%1) \n" + "movdqa %%xmm1,0x10(%0,%1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +void CopyRow_X86(const uint8* src, uint8* dst, int width) { + size_t width_tmp = static_cast<size_t>(width); + asm volatile ( + "shr $0x2,%2 \n" + "rep movsl \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_SETROW_X86 +void SetRow8_X86(uint8* dst, uint32 v32, int width) { + size_t width_tmp = static_cast<size_t>(width); + asm volatile ( + "shr $0x2,%1 \n" + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRows32_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y 
= 0; y < height; ++y) { + size_t width_tmp = static_cast<size_t>(width); + uint32* d = reinterpret_cast<uint32*>(dst); + asm volatile ( + "rep stosl \n" + : "+D"(d), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); + dst += dst_stride; } } +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu (%0,%4,1),%%xmm2 \n" 
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub 
$0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu (%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. 
+ "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + // 4 pixel loop. + ".p2align 2 \n" + "41: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +CONST uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Blend 8 pixels at a time +// Shuffle table for reversing the bytes. + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. 
+ "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" + "test $0xf,%0 \n" + "jne 41f \n" + "test $0xf,%1 \n" + "jne 41f \n" + + // 4 pixel loop. + ".p2align 2 \n" + "40: \n" + "movdqa (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqa (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqa (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jge 40b \n" + "jmp 49f \n" + + // 4 pixel unaligned loop. + ".p2align 2 \n" + "41: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATE_SSE2 +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x8,%%xmm5 \n" + + // 4 pixel loop. 
+ ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pshufhw $0xff,%%xmm1,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm4,%%xmm2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATE_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +CONST uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +CONST uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16); - } - rgb_buf += 8; // Advance 2 pixels. - } + uintptr_t alpha = 0; + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + // 4 pixel loop. 
+ ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movzb 0x3(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x0(%4,%3,4),%%xmm2 \n" + "movzb 0x7(%0),%3 \n" + "movd 0x0(%4,%3,4),%%xmm3 \n" + "pshuflw $0xc0,%%xmm2,%%xmm2 \n" + "pshuflw $0xc0,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "movzb 0xb(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x0(%4,%3,4),%%xmm2 \n" + "movzb 0xf(%0),%3 \n" + "movd 0x0(%4,%3,4),%%xmm3 \n" + "pshuflw $0xc0,%%xmm2,%%xmm2 \n" + "pshuflw $0xc0,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "pand %%xmm4,%%xmm2 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } +#endif // HAS_ARGBUNATTENUATEROW_SSE2 -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; ++x) { - uint8 u = u_buf[x]; - uint8 v = v_buf[x]; - uint8 y = y_buf[x]; - YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0); - rgb_buf += 4; // Advance 1 pixel. - } +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R +CONST vec8 kARGBToGray = { + 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 +}; + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "sub %0,%1 \n" + + // 8 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa 0x10(%0),%%xmm3 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "movdqa %%xmm1,0x10(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToGray) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); } +#endif // HAS_ARGBGRAYROW_SSSE3 -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; ++x) { - uint8 y = y_buf[x]; - YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0); - rgb_buf += 4; // Advance 1 pixel. - } +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +CONST vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +CONST vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +CONST vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + asm volatile ( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0,(%0) \n" + "movdqa %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width) { + asm volatile ( + "movd (%2),%%xmm2 \n" + "movd 0x4(%2),%%xmm3 \n" + "movd 0x8(%2),%%xmm4 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // 8 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm6,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm5 \n" + "psraw $0x7,%%xmm0 \n" + "psraw $0x7,%%xmm5 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddsw %%xmm1,%%xmm5 \n" + "psraw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0,(%0) \n" + "movdqa %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(matrix_argb) // %2 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); } +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). 
+// aligned to 16 bytes +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + ".p2align 2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqa (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "sub $0x4,%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif + ); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + asm volatile ( + "sub %1,%2 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop \n" + ".p2align 2 \n" + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqa (%1,%2,1),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqa 0x10(%1,%2,1),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqa 0x20(%1,%2,1),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqa 0x30(%1,%2,1),%%xmm5 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "movdqa %%xmm4,0x20(%1) \n" + "movdqa %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + ".p2align 2 \n" + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%1,%2,1),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 +void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) { + asm volatile ( + "movd %5,%%xmm4 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "rcpss 
%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop \n" + ".p2align 2 \n" + "40: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "psubd (%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd (%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + ".p2align 2 \n" + "10: \n" + "movdqa (%0),%%xmm0 \n" + "psubd (%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd (%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"(static_cast<intptr_t>(width)), // %4 + "rm"(area) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 +#ifdef HAS_ARGBSHADE_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "movd %3,%%xmm2 \n" + "sub %0,%1 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + ".p2align 2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBSHADE_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// TODO(fbarchard): Find 64 bit way to avoid masking. +// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2. +// Copy ARGB pixels from source image with slope to a row of destination. +// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing +// an error if movq is used. 
movd %%xmm0,%1 + +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp = 0; + asm volatile ( + "movq (%3),%%xmm2 \n" + "movq 0x8(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop \n" + ".p2align 4 \n" + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "cvttps2dq %%xmm3,%%xmm1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" +#if defined(__x86_64__) + "movd %%xmm0,%1 \n" + "mov %1,%5 \n" + "and $0x0fffffff,%1 \n" + "shr $32,%5 \n" + "pshufd $0xEE,%%xmm0,%%xmm0 \n" +#else + "movd %%xmm0,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" +#endif + "movd (%0,%1,1),%%xmm1 \n" + "movd (%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" +#if defined(__x86_64__) + "movd %%xmm0,%1 \n" + "mov %1,%5 \n" + "and $0x0fffffff,%1 \n" + "shr $32,%5 \n" +#else + "movd %%xmm0,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%5 \n" +#endif + "movd (%0,%1,1),%%xmm0 \n" + "movd (%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "sub $0x4,%4 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop \n" + ".p2align 4 \n" + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%1 \n" +#if defined(__x86_64__) + "and $0x0fffffff,%1 \n" +#endif + "movd (%0,%1,1),%%xmm0 \n" + "sub $0x1,%4 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(uv_dudv), // %3 + "+rm"(width), // %4 + "+r"(temp) // %5 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +// Bilinear row filtering combines 4x2 -> 4x1. 
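In the general case the filter below blends two source rows with a halved fraction: after the shr, f lies in 0..128 and each output byte is (row0 * (128 - f) + row1 * f) >> 7, with f == 0 and f == 64 handled as copy and pavgb fast paths. A scalar sketch with illustrative names:

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow_Reference(uint8_t* dst, const uint8_t* src,
                                     ptrdiff_t src_stride, int width_bytes,
                                     int source_y_fraction) {
  const uint8_t* src1 = src + src_stride;    // the second source row
  const int f = source_y_fraction >> 1;      // 0..128, as in the assembly
  for (int i = 0; i < width_bytes; ++i) {
    dst[i] = static_cast<uint8_t>((src[i] * (128 - f) + src1[i] * f) >> 7);
  }
}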
SSSE3 version +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x40,%3 \n" + "je 3f \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + "4: \n" + ".p2align 4 \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus } // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/row_table.cc b/files/source/row_table.cc deleted file mode 100644 index 022d9f88..00000000 --- a/files/source/row_table.cc +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "row.h" - -#define kMaxStride (2048 * 4) - -extern "C" { - -#define MAKETABLE(NAME) \ -SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\ - RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), \ - RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), \ - RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), \ - RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), \ - RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), \ - RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), \ - RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), \ - RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), \ - RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), \ - RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), \ - RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), \ - RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), \ - RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), \ - RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), \ - RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), \ - RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), \ - RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), \ - RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), \ - RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), \ - RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), \ - RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), \ - RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), \ - RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), \ - RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), \ - RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), \ - RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), \ - RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), \ - RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), \ - RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), \ - RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), \ - RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), \ - RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), \ - RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), \ - RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), \ - RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), \ - RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), \ - RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), \ - RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), \ - RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), \ - RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), \ - RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), \ - RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), \ - RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), \ - RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), \ - RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), \ - RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), \ - RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), \ - RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), \ - RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), \ - RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), \ - RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), \ - RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), \ - RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), \ - RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), \ - RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), \ - RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), \ - RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), \ - RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), \ - RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), \ - RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), \ - RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), \ - RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), \ - RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), \ - RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), \ - RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), \ - RGBU(0x04), 
RGBU(0x05), RGBU(0x06), RGBU(0x07), \ - RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), \ - RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), \ - RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), \ - RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), \ - RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), \ - RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), \ - RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), \ - RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), \ - RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), \ - RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), \ - RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), \ - RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), \ - RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), \ - RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), \ - RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), \ - RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), \ - RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), \ - RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), \ - RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), \ - RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), \ - RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), \ - RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), \ - RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), \ - RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), \ - RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), \ - RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), \ - RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), \ - RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), \ - RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), \ - RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), \ - RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), \ - RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), \ - RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), \ - RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), \ - RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), \ - RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), \ - RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), \ - RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), \ - RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), \ - RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), \ - RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), \ - RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), \ - RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), \ - RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), \ - RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), \ - RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), \ - RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), \ - RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), \ - RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), \ - RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), \ - RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), \ - RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), \ - RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), \ - RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), \ - RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), \ - RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), \ - RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), \ - RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), \ - RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), \ - RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), \ - RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), \ - RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), \ - RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), \ - RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), \ - RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), \ - RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), \ - RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), \ - RGBV(0x14), RGBV(0x15), 
RGBV(0x16), RGBV(0x17), \ - RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), \ - RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), \ - RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), \ - RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), \ - RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), \ - RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), \ - RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), \ - RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), \ - RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), \ - RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), \ - RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), \ - RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), \ - RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), \ - RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), \ - RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), \ - RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), \ - RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), \ - RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), \ - RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), \ - RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), \ - RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), \ - RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), \ - RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), \ - RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), \ - RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), \ - RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), \ - RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), \ - RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), \ - RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), \ - RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), \ - RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), \ - RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), \ - RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), \ - RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), \ - RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), \ - RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), \ - RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), \ - RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), \ - RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), \ - RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), \ - RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), \ - RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), \ - RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), \ - RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), \ - RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), \ - RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), \ - RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), \ - RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), \ - RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), \ - RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), \ - RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), \ - RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), \ - RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), \ - RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), \ - RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), \ - RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), \ - RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), \ - RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \ -}; - -// ARGB table -#define RGBY(i) { \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(256 * 64 - 1) \ -} - -#define RGBU(i) { \ - static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ - 0, \ - 0 \ -} - -#define RGBV(i) { \ - 0, \ - static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(1.596 * 64 * (i - 
128) + 0.5), \ - 0 \ -} - -#ifdef OSX -MAKETABLE(kCoefficientsRgbY) -#else -MAKETABLE(_kCoefficientsRgbY) -#endif - -#undef RGBY -#undef RGBU -#undef RGBV - -// BGRA table -#define RGBY(i) { \ - static_cast<int16>(256 * 64 - 1), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \ -} - -#define RGBU(i) { \ - 0, \ - 0, \ - static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \ -} - -#define RGBV(i) { \ - 0, \ - static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ - 0 \ -} - -#ifdef OSX -MAKETABLE(kCoefficientsBgraY) -#else -MAKETABLE(_kCoefficientsBgraY) -#endif - - -#undef RGBY -#undef RGBU -#undef RGBV - -// ABGR table -#define RGBY(i) { \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ - static_cast<int16>(256 * 64 - 1) \ -} - -#define RGBU(i) { \ - 0, \ - static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \ - 0 \ -} - -#define RGBV(i) { \ - static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \ - static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \ - 0, \ - 0 \ -} - -#ifdef OSX -MAKETABLE(kCoefficientsAbgrY) -#else -MAKETABLE(_kCoefficientsAbgrY) -#endif - - -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_raw += 3; - } -} - -void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - uint8 b = src_bg24[0]; - uint8 g = src_bg24[1]; - uint8 r = src_bg24[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb[3] = 255u; - dst_argb += 4; - src_bg24 += 3; - } -} - -// C versions do the same -void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - BG24ToARGBRow_C(src_argb, row, pix); - ARGBToYRow_C(row, dst_y, pix); -} - -void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - RAWToARGBRow_C(src_argb, row, pix); - ARGBToYRow_C(row, dst_y, pix); -} - -void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - BG24ToARGBRow_C(src_argb, row, pix); - BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); -} - -void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - RAWToARGBRow_C(src_argb, row, pix); - RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); -} - -static inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; -} - -static inline int RGBToU(uint8 r, uint8 g, uint8 b) { - return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; -} -static inline int RGBToV(uint8 r, uint8 g, uint8 b) { - return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; -} - -#define MAKEROWY(NAME,R,G,B) \ -void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ 
- for (int x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += 4; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - for (int x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \ - src_rgb1[B] + src_rgb1[B + 4]) >> 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \ - src_rgb1[G] + src_rgb1[G + 4]) >> 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \ - src_rgb1[R] + src_rgb1[R + 4]) >> 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += 8; \ - src_rgb1 += 8; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ -} - -MAKEROWY(ARGB,2,1,0) -MAKEROWY(BGRA,1,2,3) -MAKEROWY(ABGR,0,1,2) - -#if defined(HAS_RAWTOYROW_SSSE3) - -void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - BG24ToARGBRow_SSSE3(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} - -void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - RAWToARGBRow_SSSE3(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} - -#endif - -#if defined(HAS_RAWTOUVROW_SSSE3) -#if defined(HAS_ARGBTOUVROW_SSSE3) -void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - BG24ToARGBRow_SSSE3(src_argb, row, pix); - BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} - -void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - RAWToARGBRow_SSSE3(src_argb, row, pix); - RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} - -#else - -void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - BG24ToARGBRow_SSSE3(src_argb, row, pix); - BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); -} - -void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - RAWToARGBRow_SSSE3(src_argb, row, pix); - RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); -} - -#endif -#endif - -} // extern "C" diff --git a/files/source/row_win.cc b/files/source/row_win.cc index 2bc5fb13..e3b01f27 100644 --- a/files/source/row_win.cc +++ b/files/source/row_win.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,173 +8,925 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "row.h" +#include "libyuv/row.h" +#ifdef __cplusplus +namespace libyuv { extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +// TODO(fbarchard): I420ToRGB24, I420ToRAW #ifdef HAS_ARGBTOYROW_SSSE3 -#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -// Constant multiplication table for converting ARGB to I400. -extern "C" TALIGN16(const int8, kARGBToY[16]) = { +// Constants for ARGB. +static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; -extern "C" TALIGN16(const int8, kARGBToU[16]) = { +static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; -extern "C" TALIGN16(const int8, kARGBToV[16]) = { +static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -// Constants for BGRA -extern "C" TALIGN16(const int8, kBGRAToY[16]) = { +// Constants for BGRA. +static const vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 }; -extern "C" TALIGN16(const int8, kBGRAToU[16]) = { +static const vec8 kBGRAToU = { 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 }; -extern "C" TALIGN16(const int8, kBGRAToV[16]) = { +static const vec8 kBGRAToV = { 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 }; -// Constants for ABGR -extern "C" TALIGN16(const int8, kABGRToY[16]) = { +// Constants for ABGR. +static const vec8 kABGRToY = { 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 }; -extern "C" TALIGN16(const int8, kABGRToU[16]) = { +static const vec8 kABGRToU = { -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 }; -extern "C" TALIGN16(const int8, kABGRToV[16]) = { +static const vec8 kABGRToV = { 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 }; -extern "C" TALIGN16(const uint8, kAddY16[16]) = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, +// Constants for RGBA. +static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; -extern "C" TALIGN16(const uint8, kAddUV128[16]) = { +static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; -// Shuffle table for converting BG24 to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { +static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -// Convert 16 ARGB pixels (64 bytes) to 16 Y values -__declspec(naked) +// Shuffle table for converting BGRA to ARGB. +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting ABGR to ARGB. 
+static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting RGBA to ARGB. +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + +// Shuffle table for converting ARGB to RGBA. +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 16 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_bgra + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskBGRAToARGB + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_abgr + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskABGRToARGB + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_rgba + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskRGBAToARGB + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgba + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskARGBToRGBA + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRGB24ToARGB + + align 16 + convertloop: + movdqu xmm0, [eax] + 
movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRAWToARGB + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. +__declspec(naked) __declspec(align(16)) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { +__asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 16 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 24 instructions +__declspec(naked) __declspec(align(16)) +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { +__asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + 
movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 16 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) __declspec(align(16)) +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { +__asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 16 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 16 + convertloop: + movdqa xmm0, [eax] // fetch 16 pixels of argb + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqa [edx + 16], xmm1 // store 1 + movdqa [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRAW + + align 16 + convertloop: + movdqa xmm0, [eax] // fetch 16 pixels of argb + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + 
lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqa [edx + 16], xmm1 // store 1 + movdqa [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + align 16 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// TODO(fbarchard): Improve sign extension/packing. +__declspec(naked) __declspec(align(16)) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + align 16 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + align 16 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrl xmm0, 4 + psrl xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
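In scalar terms each Y value is a 7-bit fixed-point, video-range (BT.601) luma built from the kARGBToY weights plus the kAddY16 offset defined above. A minimal reference with an illustrative helper name:

#include <stdint.h>

static void ARGBToYRow_Reference(const uint8_t* src_argb, uint8_t* dst_y,
                                 int pix) {
  for (int i = 0; i < pix; ++i) {
    const int b = src_argb[0], g = src_argb[1], r = src_argb[2];
    // pmaddubsw/phaddw with kARGBToY computes b*13 + g*65 + r*33 per pixel,
    // psrlw 7 scales it back, and paddb adds the +16 video-range offset.
    dst_y[i] = static_cast<uint8_t>(((b * 13 + g * 65 + r * 33) >> 7) + 16);
    src_argb += 4;
  }
}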
+__declspec(naked) __declspec(align(16)) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kARGBToY - movdqa xmm6, _kAddY16 + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY - convertloop : + align 16 + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 sub ecx, 16 - ja convertloop + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop ret } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kBGRAToY - movdqa xmm6, _kAddY16 + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY - convertloop : + align 16 + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 sub ecx, 16 - ja convertloop + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop ret } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kABGRToY - movdqa xmm6, _kAddY16 + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY - convertloop : + align 16 + 
convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 sub ecx, 16 - ja convertloop + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop ret } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -185,12 +937,13 @@ __asm { mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, _kARGBToU - movdqa xmm6, _kARGBToV - movdqa xmm5, _kAddUV128 + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - convertloop : + align 16 + convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -227,18 +980,89 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + 
} +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values sub ecx, 16 - ja convertloop + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + pop edi pop esi ret } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -249,12 +1073,13 @@ __asm { mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, _kBGRAToU - movdqa xmm6, _kBGRAToV - movdqa xmm5, _kAddUV128 + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - convertloop : + align 16 + convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -291,18 +1116,89 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd 
+ pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values sub ecx, 16 - ja convertloop + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + pop edi pop esi ret } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -313,12 +1209,13 @@ __asm { mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, _kABGRToU - movdqa xmm6, _kABGRToV - movdqa xmm5, _kAddUV128 + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - convertloop : + align 16 + convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -355,282 +1252,2846 @@ __asm { paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values + sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values sub ecx, 16 - ja convertloop + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + pop edi pop esi ret } } -__declspec(naked) -void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* 
dst_u, uint8* dst_v, int width) { __asm { - mov eax, [esp + 4] // src_bg24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa xmm6, _kShuffleMaskBG24ToARGB + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v - convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 - movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + 
pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_I422TOARGBROW_SSSE3 + +#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ +#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ +#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +static const vec8 kUVToB = { + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +}; + +static const vec8 kUVToR = { + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +}; + +static const vec8 kUVToG = { + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +}; + +static const vec8 kVUToB = { + VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, +}; + +static const vec8 kVUToR = { + VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, +}; + +static const vec8 kVUToG = { + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, +}; + +static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; +static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; +static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; +static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; +static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; + +// TODO(fbarchard): NV12/NV21 fetch UV and use directly. +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. + +// Read 8 UV from 411. +#define READYUV444 __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + } + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 2 UV from 411, upsample to 8 UV. +#define READYUV411 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y. 
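// A minimal scalar sketch of the fixed-point math the YUVTORGB macro below
// implements, using the YG/UB/UG/VG/VR constants defined above (6-bit scale):
// the chroma terms are centered on 128, the luma term is (y - 16) * YG, and
// the sum is shifted right by 6 and saturated to 8 bits.  clamp8 and
// YuvPixel_C_Sketch are illustrative names, not libyuv functions.
#include <stdint.h>
static inline uint8_t clamp8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_C_Sketch(uint8_t y, uint8_t u, uint8_t v,
                              uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;                                    // YG
  *b = clamp8((y1 + 127 * (u - 128)) >> 6);                  // UB (VB is 0)
  *g = clamp8((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);  // UG and VG
  *r = clamp8((y1 + 102 * (v - 128)) >> 6);                  // VR (UR is 0)
}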
+#define YUVTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Convert 8 pixels: 8 VU and 8 Y. +#define YVUTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// 8 pixels, dest aligned 16. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
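// The READYUV422/READYUV411 macros above upsample chroma by plain
// replication: one (U, V) pair is reused for 2 (4:2:2) or 4 (4:1:1)
// consecutive Y samples before the shared YUVTORGB math runs.  A scalar
// sketch of the 4:2:2 case, reusing the illustrative YuvPixel_C_Sketch
// helper from the earlier sketch (both names are hypothetical):
#include <stdint.h>
static void I422ToARGBRow_C_Sketch(const uint8_t* y_buf, const uint8_t* u_buf,
                                   const uint8_t* v_buf, uint8_t* argb_buf,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b, g, r;
    YuvPixel_C_Sketch(y_buf[x], u_buf[x / 2], v_buf[x / 2], &b, &g, &r);
    argb_buf[4 * x + 0] = b;    // output is B, G, R, A in memory
    argb_buf[4 * x + 1] = g;
    argb_buf[4 * x + 2] = r;
    argb_buf[4 * x + 3] = 255;  // opaque alpha, like the pcmpeqb xmm5 fill
  }
}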
+__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV411 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, unaligned. 
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV411 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
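// NV12/NV21 store a single interleaved chroma plane, so READNV12 above can
// load four (U, V) pairs with one movq instead of gathering from separate U
// and V planes; NV21 differs only in byte order, which YVUTORGB absorbs by
// using the VU-ordered coefficient tables.  A scalar sketch of the indexing,
// again reusing the hypothetical YuvPixel_C_Sketch helper:
#include <stdint.h>
static void NV12ToARGBRow_C_Sketch(const uint8_t* y_buf, const uint8_t* uv_buf,
                                   uint8_t* argb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t u = uv_buf[x & ~1];        // UVUV... : U at the even offset
    uint8_t v = uv_buf[(x & ~1) + 1];  // V at the odd offset (NV21 swaps them)
    uint8_t b, g, r;
    YuvPixel_C_Sketch(y_buf[x], u, v, &b, &g, &r);
    argb_buf[4 * x + 0] = b;
    argb_buf[4 * x + 1] = g;
    argb_buf[4 * x + 2] = r;
    argb_buf[4 * x + 3] = 255;
  }
}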
+__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov 
edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqa [edx], xmm2 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqu [edx], xmm2 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + mov eax,0x10001000 + movd xmm3,eax + pshufd xmm3,xmm3,0 + mov eax,0x012a012a + movd xmm2,eax + pshufd xmm2,xmm2,0 + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + align 
16 + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 // Y.Y + psubusw xmm0, xmm3 + pmulhuw xmm0, xmm2 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_YTOARGBROW_SSE2 + +#ifdef HAS_MIRRORROW_SSSE3 + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, kShuffleMirror + lea eax, [eax - 16] + + align 16 + convertloop: + movdqa xmm0, [eax + ecx] + pshufb xmm0, xmm5 sub ecx, 16 - ja convertloop + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop ret } } +#endif // HAS_MIRRORROW_SSSE3 -__declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int pix) { +#ifdef HAS_MIRRORROW_SSE2 +// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 +// version can not. +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa xmm6, _kShuffleMaskRAWToARGB + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16] + + align 16 + convertloop: + movdqu xmm0, [eax + ecx] + movdqa xmm1, xmm0 // swap bytes + psllw xmm0, 8 + psrlw xmm1, 8 + por xmm0, xmm1 + pshuflw xmm0, xmm0, 0x1b // swap words + pshufhw xmm0, xmm0, 0x1b + pshufd xmm0, xmm0, 0x4e // swap qwords + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSE2 - convertloop : +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + align 16 + convertloop: movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 + lea eax, [eax - 16] + pshufb xmm0, xmm1 + sub ecx, 8 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 + +// Shuffle table for reversing the bytes. 
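// The kARGBShuffleMirror table below reverses whole 4-byte pixels within each
// 16-byte load (bytes 12..15, 8..11, 4..7, 0..3) while the loop walks the row
// from the far end via [eax + ecx * 4].  A minimal scalar sketch of the same
// row operation (the _C_Sketch name is illustrative):
#include <stdint.h>
static void ARGBMirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst,
                                   int width) {
  src += 4 * (width - 1);              // start at the last pixel of the row
  for (int x = 0; x < width; ++x) {
    dst[0] = src[0];                   // each BGRA pixel is copied unchanged,
    dst[1] = src[1];                   // only the pixel order is reversed
    dst[2] = src[2];
    dst[3] = src[3];
    dst += 4;
    src -= 4;
  }
}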
+static const uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +__declspec(naked) __declspec(align(16)) +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, kARGBShuffleMirror + lea eax, [eax - 16] + + align 16 + convertloop: + movdqa xmm0, [eax + ecx * 4] + pshufb xmm0, xmm5 + sub ecx, 4 movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - ja convertloop + lea edx, [edx + 16] + jg convertloop ret } } +#endif // HAS_ARGBMIRRORROW_SSSE3 -__declspec(naked) -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +#ifdef HAS_SPLITUV_SSE2 +__declspec(naked) __declspec(align(16)) +void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsRgbY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsRgbY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqa [edx], xmm0 + movdqa [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_SPLITUV_SSE2 + +#ifdef HAS_COPYROW_SSE2 +// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. 
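// SplitUV_SSE2 above, and the YUY2/UYVY rows further down, all separate even
// and odd bytes with the 0x00ff00ff mask (pand keeps the even bytes, psrlw 8
// exposes the odd ones).  A scalar sketch of that deinterleave (illustrative
// name, not a libyuv entry point):
#include <stdint.h>
static void SplitUV_C_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes -> U plane
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes  -> V plane
  }
}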
+__declspec(naked) __declspec(align(16)) +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa [eax + edx], xmm0 + movdqa [eax + edx + 16], xmm1 + lea eax, [eax + 32] + sub ecx, 32 + jg convertloop + ret + } +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +__declspec(naked) __declspec(align(16)) +void CopyRow_X86(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep movsd + mov edi, edx + mov esi, eax + ret + } +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_SETROW_X86 +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void SetRow8_X86(uint8* dst, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// SetRow32 writes 'count' words using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void SetRows32_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + __asm { + push esi + push edi + push ebp + mov edi, [esp + 12 + 4] // dst + mov eax, [esp + 12 + 8] // v32 + mov ebp, [esp + 12 + 12] // width + mov edx, [esp + 12 + 16] // dst_stride + mov esi, [esp + 12 + 20] // height + lea ecx, [ebp * 4] + sub edx, ecx // stride - width * 4 + + align 16 + convertloop: + mov ecx, ebp + rep stosd + add edi, edx + sub esi, 1 + jg convertloop + + pop ebp + pop edi + pop esi + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov 
edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 
8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 
+ packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. + alignloop1: + test edx, 15 // aligned? + je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3,0F5h // 8 alpha words + pshuflw xmm3, xmm3,0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3,0F5h // 8 alpha words + pshuflw xmm3, xmm3,0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. 
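// In both the 4 pixel loop above and the 1 pixel tail below, each B, G and R
// channel is composited as dst = min(255, src0 + ((src1 * (256 - a0)) >> 8)),
// where a0 is the src_argb0 alpha and the result alpha is forced to 0xff by
// the por with xmm4 (an "over" blend onto an effectively opaque background).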
+ convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3,0F5h // 8 alpha words + pshuflw xmm3, xmm3,0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +// Same as SSE2, but replaces: +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha +// Blend 8 pixels at a time. + +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. + alignloop1: + test edx, 15 // aligned? + je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + test eax, 15 // unaligned? + jne convertuloop4 + test esi, 15 // unaligned? + jne convertuloop4 + + // 4 pixel loop. + convertloop4: + movdqa xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqa xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqa xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + jmp convertloop4b + + // 4 pixel unaligned loop. 
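For reference, the per-pixel math these blend loops implement is roughly the following scalar form. This is a sketch with my own names, not libyuv's C reference code; it treats src_argb0 as premultiplied ARGB and, like the SIMD path, forces the output alpha to 255.

#include <stdint.h>

// Rough scalar equivalent of ARGBBlendRow: out = src + dst * (256 - src_a) / 256,
// saturated at 255, with the destination alpha forced to opaque.
static inline uint8_t BlendChannelSketch(uint8_t src, uint8_t dst, uint8_t src_a) {
  int v = src + ((dst * (256 - src_a)) >> 8);
  return v > 255 ? 255 : static_cast<uint8_t>(v);
}

static void ARGBBlendRowSketch(const uint8_t* src_argb0, const uint8_t* src_argb1,
                               uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t a = src_argb0[3];
    dst_argb[0] = BlendChannelSketch(src_argb0[0], src_argb1[0], a);  // B
    dst_argb[1] = BlendChannelSketch(src_argb0[1], src_argb1[1], a);  // G
    dst_argb[2] = BlendChannelSketch(src_argb0[2], src_argb1[2], a);  // R
    dst_argb[3] = 255;  // 'por xmm0, xmm4' sets the alpha byte to 255 before the adds
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}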
+ convertuloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertuloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATE_SSE2 +// Attenuate 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff + psrld xmm5, 8 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm0 // first 2 + pshufhw xmm2, xmm0,0FFh // 8 alpha words + pshuflw xmm2, xmm2,0FFh + pmulhuw xmm0, xmm2 // rgb * a + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm1 // next 2 pixels + pshufhw xmm2, xmm1,0FFh // 8 alpha words + pshuflw xmm2, xmm2,0FFh + pmulhuw xmm1, xmm2 // rgb * a + movdqa xmm2, [eax] // alphas + psrlw xmm0, 8 + pand xmm2, xmm4 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + pand xmm0, xmm5 // keep original alphas + por xmm0, xmm2 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATE_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. 
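For reference, the attenuate rows premultiply each color channel by its alpha. Roughly, per pixel (a scalar sketch with my own names, ignoring the small rounding differences of the pmulhuw path):

#include <stdint.h>

// Scalar sketch of ARGBAttenuateRow: B, G and R are scaled by alpha / 256;
// the alpha byte itself is carried through unchanged.
static void ARGBAttenuateRowSketch(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    dst_argb[0] = static_cast<uint8_t>((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = static_cast<uint8_t>((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = static_cast<uint8_t>((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = src_argb[3];  // original alpha kept (pand/por with 0xff000000)
    src_argb += 4;
    dst_argb += 4;
  }
}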
+static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsBgraY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsBgraY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 + movdqa xmm5, kShuffleAlpha1 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqa xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqa xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqa xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqa xmm2, [eax] // mask original alpha + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// Aligned to 16 bytes. 
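The unattenuate row below reverses that premultiply. Roughly, per pixel (a scalar sketch; the SSE2 code avoids the divide by looking up a fixed-point reciprocal of alpha in fixed_invtbl8, which is defined elsewhere in the library):

#include <stdint.h>

// Scalar sketch of ARGBUnattenuateRow: divide each color channel by alpha,
// clamping to 255; pixels with alpha == 0 are passed through unchanged here.
static void ARGBUnattenuateRowSketch(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? (src_argb[c] * 255u + a / 2) / a : src_argb[c];
      dst_argb[c] = v > 255 ? 255 : static_cast<uint8_t>(v);
    }
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}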
+__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsAbgrY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsAbgrY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm { - pushad - mov edx, [esp + 32 + 4] // Y - mov edi, [esp + 32 + 8] // U - mov esi, [esp + 32 + 12] // V - mov ebp, [esp + 32 + 16] // rgb - mov ecx, [esp + 32 + 20] // width - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] - lea edx, [edx + 1] - paddsw mm0, [_kCoefficientsRgbY + 8 * eax] - psraw mm0, 6 - packuswb mm0, mm0 - movd [ebp], mm0 - lea ebp, [ebp + 4] - sub ecx, 1 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // Y - mov edx, [esp + 4 + 8] // rgb - mov ecx, [esp + 4 + 12] // width - - convertloop : - movzx ebx, byte ptr [eax] - movq mm0, [_kCoefficientsRgbY + 8 * ebx] - psraw mm0, 6 - movzx ebx, byte ptr [eax + 1] - movq mm1, [_kCoefficientsRgbY + 8 * ebx] - psraw mm1, 6 - packuswb mm0, mm1 - lea eax, [eax + 2] - movq [edx], mm0 - lea edx, [edx + 8] - sub ecx, 2 - ja convertloop + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb0 + mov edx, [esp + 8 + 8] // dst_argb + mov ecx, [esp + 8 + 12] // width + sub edx, eax + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 - pop ebx + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words + pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqa xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words + pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + + movdqa xmm2, [eax] // alphas + pand xmm2, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm2 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + pop edi + pop esi ret } } +#endif // HAS_ARGBUNATTENUATEROW_SSE2 -#endif +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Constant for ARGB color to gray scale: 0.11 * B + 
0.59 * G + 0.30 * R +static const vec8 kARGBToGray = { + 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 +}; + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, kARGBToGray + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqa xmm2, [eax] // A + movdqa xmm3, [eax + 16] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + sub ecx, 8 + movdqa [eax + edx], xmm0 + movdqa [eax + edx + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static const vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +static const vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, kARGBToSepiaB + movdqa xmm3, kARGBToSepiaG + movdqa xmm4, kARGBToSepiaR + + align 16 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
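The gray, sepia and color-matrix rows all follow the same pattern: each output channel is a dot product of the input B, G, R (and A) bytes with signed 7-bit fixed-point weights, shifted right by 7. A scalar sketch of the general color-matrix form (names are my own; kARGBToGray and the kARGBToSepia* tables above are instances of this with the alpha weight set to 0):

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8_t>(v));
}

// Scalar sketch of ARGBColorMatrixRow: 'm' holds three groups of four signed
// weights, one group per output channel (B, G, R); the alpha byte is preserved.
static void ARGBColorMatrixRowSketch(uint8_t* dst_argb, const int8_t* m, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2], a = dst_argb[3];
    uint8_t nb = Clamp255((b * m[0] + g * m[1] + r * m[2]  + a * m[3])  >> 7);
    uint8_t ng = Clamp255((b * m[4] + g * m[5] + r * m[6]  + a * m[7])  >> 7);
    uint8_t nr = Clamp255((b * m[8] + g * m[9] + r * m[10] + a * m[11]) >> 7);
    dst_argb[0] = nb;
    dst_argb[1] = ng;
    dst_argb[2] = nr;
    dst_argb += 4;
  }
}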
+__declspec(naked) __declspec(align(16)) +void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov edx, [esp + 8] /* matrix_argb */ + mov ecx, [esp + 12] /* width */ + movd xmm2, [edx] + movd xmm3, [edx + 4] + movd xmm4, [edx + 8] + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pshufd xmm4, xmm4, 0 + + align 16 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm6 // B + phaddsw xmm5, xmm1 // G + psraw xmm0, 7 // B + psraw xmm5, 7 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddsw xmm5, xmm1 + psraw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklbw xmm5, xmm6 // 8 RA values + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] /* dst_argb */ + mov edi, [esp + 16 + 8] /* table_argb */ + mov ecx, [esp + 16 + 12] /* width */ + xor ebx, ebx + xor edx, edx + + align 16 + convertloop: + mov ebp, dword ptr [eax] // BGRA + mov esi, ebp + and ebp, 255 + shr esi, 8 + and esi, 255 + mov bl, [edi + ebp * 4 + 0] // B + mov dl, [edi + esi * 4 + 1] // G + mov ebp, dword ptr [eax] // BGRA + mov esi, ebp + shr ebp, 16 + shr esi, 24 + and ebp, 255 + mov [eax], bl + mov [eax + 1], dl + mov bl, [edi + ebp * 4 + 2] // R + mov dl, [edi + esi * 4 + 3] // A + mov [eax + 2], bl + mov [eax + 3], dl + lea eax, [eax + 4] + sub ecx, 1 + jg convertloop + pop ebp + pop edi + pop esi + pop ebx + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +// Aligned to 16 bytes. 
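The quantize row below posterizes each color channel. Judging from the pmulhuw by 'scale', scale is presumably a 16.16 fixed-point factor (for example 65536 / interval_size); per pixel the math is roughly (a scalar sketch with my own names):

#include <stdint.h>

// Scalar sketch of ARGBQuantizeRow: snap each of B, G, R to a fixed number of
// levels; the alpha channel is masked off and written back unchanged.
static void ARGBQuantizeRowSketch(uint8_t* dst_argb, int scale, int interval_size,
                                  int interval_offset, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = dst_argb[c];
      dst_argb[c] = static_cast<uint8_t>(((v * scale) >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;
  }
}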
+__declspec(naked) __declspec(align(16)) +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqa xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + sub ecx, 4 + movdqa [eax], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte +// aligned. 
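In scalar terms, the comment above describes the usual integral-image trick: the sum over a rectangle is topleft - topright - botleft + botright, and the average is that sum times 1 / area. A sketch with one int32 per channel and four per pixel, matching the SSE2 layout (names are my own):

#include <stdint.h>

// Scalar sketch of CumulativeSumToAverage: 'width' is the rectangle width in
// int32 entries (4 per pixel), 'area' the number of pixels in the rectangle.
static void CumulativeSumToAverageSketch(const int32_t* topleft, const int32_t* botleft,
                                         int width, int area, uint8_t* dst, int count) {
  const float inv_area = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32_t sum = topleft[c] - topleft[width + c] - botleft[c] + botleft[width + c];
      int v = static_cast<int>(sum * inv_area + 0.5f);
      dst[c] = v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8_t>(v));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}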
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm4, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm4, xmm4 + rcpss xmm4, xmm4 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + // 4 pixel loop + align 4 + l4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movdqa xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + sub esi, edx + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + align 4 + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqa xmm2, [edx + esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqa xmm3, [edx + esi + 16] + paddd xmm3, xmm0 + paddd xmm0, xmm4 + movdqa xmm4, [edx + esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqa xmm5, [edx + esi + 48] + paddd xmm5, xmm0 + + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + movdqa [edx + 32], xmm4 + movdqa [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [edx + esi] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBSHADE_SSE2 +// Shade 4 pixels at a time by specified value. 
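The shade row below multiplies every channel by the corresponding byte of a packed ARGB 'value'. Roughly, per pixel (a scalar sketch, ignoring the SIMD path's slightly different rounding):

#include <stdint.h>

// Scalar sketch of ARGBShadeRow: scale B, G, R and A by value's bytes / 256.
static void ARGBShadeRowSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width, uint32_t value) {
  const uint32_t vb = value & 0xff;
  const uint32_t vg = (value >> 8) & 0xff;
  const uint32_t vr = (value >> 16) & 0xff;
  const uint32_t va = (value >> 24) & 0xff;
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = static_cast<uint8_t>((src_argb[0] * vb) >> 8);
    dst_argb[1] = static_cast<uint8_t>((src_argb[1] * vg) >> 8);
    dst_argb[2] = static_cast<uint8_t>((src_argb[2] * vr) >> 8);
    dst_argb[3] = static_cast<uint8_t>((src_argb[3] * va) >> 8);
    src_argb += 4;
    dst_argb += 4;
  }
}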
+// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + sub edx, eax + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADE_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) __declspec(align(16)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + align 4 + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + sub ecx, 4 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. 
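A scalar sketch of the vertical filtering the interpolation row below performs: each output byte is a weighted mix of the same byte in two adjacent source rows. The SSSE3 version works with 7-bit weights and special-cases fractions of 0 and 1/2; the names here are my own.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the bilinear row filter: source_y_fraction (0..255) is the
// weight of the lower row; width is given here in bytes (4 per ARGB pixel).
static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t src_stride, int width_bytes,
                                 int source_y_fraction) {
  const int f1 = source_y_fraction;
  const int f0 = 256 - f1;
  const uint8_t* row0 = src;
  const uint8_t* row1 = src + src_stride;
  for (int i = 0; i < width_bytes; ++i) {
    dst[i] = static_cast<uint8_t>((row0[i] * f0 + row1[i] * f1) >> 8);
  }
}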
+__declspec(naked) __declspec(align(16)) +void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + cmp eax, 0 + je xloop1 + cmp eax, 64 + je xloop2 + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 16 + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + + pop edi + pop esi + ret + + align 16 + xloop1: + movdqa xmm0, [esi] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 + + pop edi + pop esi + ret + + align 16 + xloop2: + movdqa xmm0, [esi] + pavgb xmm0, [esi + edx] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 + + pop edi + pop esi + ret + } +} + +#endif // _M_IX86 + +#ifdef __cplusplus } // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/scale.cc b/files/source/scale.cc index d3b7d333..38910c91 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -12,34 +12,37 @@ #include <assert.h> #include <string.h> +#include <stdlib.h> // For getenv() #include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyPlane +#include "libyuv/row.h" -#if defined(_MSC_VER) -#define ALIGN16(var) __declspec(align(16)) var -#else -#define ALIGN16(var) var __attribute__((aligned(16))) +#ifdef __cplusplus +namespace libyuv { +extern "C" { #endif -// Note: A Neon reference manual -// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Bilinear SSE2 is disabled. +#define SSE2_DISABLED 1 + // Note: Some SSE2 reference manuals // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf -namespace libyuv { - // Set the following flag to true to revert to only // using the reference implementation ScalePlaneBox(), and // NOT the optimized versions. Useful for debugging and // when comparing the quality of the resulting YUV planes // as produced by the optimized and non-optimized versions. - static bool use_reference_impl_ = false; +LIBYUV_API void SetUseReferenceImpl(bool use) { use_reference_impl_ = use; } +// ScaleRowDown2Int also used by planar functions + /** * NEON downscalers with interpolation. * @@ -47,126 +50,53 @@ void SetUseReferenceImpl(bool use) { * */ -#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, - uint8* dst, int dst_width) { - __asm__ volatile - ( - "1:\n" - "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 - "vst1.u8 {q0}, [%1]! 
\n" // store even pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); -void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - __asm__ volatile - ( - "mov r4, #2 \n" // rounding constant - "add %1, %0 \n" // change the stride to row 2 pointer - "vdup.16 q4, r4 \n" - "1:\n" - "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment - "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 - "vpadal.u8 q1, q3 \n" - "vadd.u16 q0, q4 \n" // rounding - "vadd.u16 q1, q4 \n" - "vshrn.u16 d0, q0, #2 \n" // downshift and pack - "vshrn.u16 d1, q1, #2 \n" - "vst1.u8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" // 16 processed per loop - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List - ); -} +void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); #define HAS_SCALEROWDOWN4_NEON -// Expecting widths on arm devices to be smaller. Went with 8x4 blocks -// to get most coverage. Look to back and evaluate 16x4 blocks with -// handling of leftovers. -static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, - uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "mov r4, #4 \n" - "1: \n" - "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to - "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block - - "vst1.u16 {d0[0]}, [%1]! \n" - - "subs %2, #2 \n" // dst_width -= 2 - "bhi 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "r4", "q0", "q1", "memory", "cc" - ); -} +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); -static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "1: \n" - "mov r4, %0 \n" - "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data - "vld1.u8 {d1}, [r4],%3 \n" - "vld1.u8 {d2}, [r4],%3 \n" - "vld1.u8 {d3}, [r4] \n" - - // data is loaded up int q0 and q1 - // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13 - // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a20 a21 a22 a23 b20 b21 b22 b23 - // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13 - "vpaddl.u8 q0, q0 \n" - - // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23 - // d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23 - "vpadal.u8 q0, q1 \n" - - // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23 - // d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23 - "vpaddl.u16 q0, q0 \n" - - - // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23 - // b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23 - "vadd.u32 d0, d1 \n" - - "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding - - "vst1.u8 {d0[0]}, [%1]! \n" - "vst1.u8 {d0[4]}, [%1]! 
\n" - - "add %0, #8 \n" // move src pointer to next 8 pixels - "subs %2, #2 \n" // dst_width -= 2 - "bhi 1b \n" - - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "q0", "q1", "memory", "cc" - ); -} +#define HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#define HAS_SCALEROWDOWN38_NEON +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 16x2 -> 16x1 +#define HAS_SCALEFILTERROWS_NEON +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction); /** * SSE2 downscalers with interpolation. @@ -175,137 +105,141 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, * */ -// Constants for SSE2 code -#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \ - !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR -#if defined(_MSC_VER) -#define TALIGN16(t, var) __declspec(align(16)) t _ ## var -#elif defined(OSX) -#define TALIGN16(t, var) t var __attribute__((aligned(16))) + +// Constants for SSSE3 code +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) + +// GCC 4.2 on OSX has link error when passing static or const to inline. +// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. +#ifdef __APPLE__ +#define CONST #else -#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#define CONST static const #endif // Offsets for source bytes 0 to 9 -extern "C" TALIGN16(const uint8, shuf0[16]) = +CONST uvec8 kShuf0 = { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -extern "C" TALIGN16(const uint8, shuf1[16]) = +CONST uvec8 kShuf1 = { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -extern "C" TALIGN16(const uint8, shuf2[16]) = +CONST uvec8 kShuf2 = { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; // Offsets for source bytes 0 to 10 -extern "C" TALIGN16(const uint8, shuf01[16]) = +CONST uvec8 kShuf01 = { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -extern "C" TALIGN16(const uint8, shuf11[16]) = +CONST uvec8 kShuf11 = { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-extern "C" TALIGN16(const uint8, shuf21[16]) = +CONST uvec8 kShuf21 = { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; // Coefficients for source bytes 0 to 10 -extern "C" TALIGN16(const uint8, madd01[16]) = +CONST uvec8 kMadd01 = { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; // Coefficients for source bytes 10 to 21 -extern "C" TALIGN16(const uint8, madd11[16]) = +CONST uvec8 kMadd11 = { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; // Coefficients for source bytes 21 to 31 -extern "C" TALIGN16(const uint8, madd21[16]) = +CONST uvec8 kMadd21 = { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; // Coefficients for source bytes 21 to 31 -extern "C" TALIGN16(const int16, round34[8]) = +CONST vec16 kRound34 = { 2, 2, 2, 2, 2, 2, 2, 2 }; -extern "C" TALIGN16(const uint8, shuf38a[16]) = +CONST uvec8 kShuf38a = { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; -extern "C" TALIGN16(const uint8, shuf38b[16]) = +CONST uvec8 kShuf38b = { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; // Arrange words 0,3,6 into 0,1,2 -extern "C" TALIGN16(const uint8, shufac0[16]) = +CONST uvec8 kShufAc = { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; // Arrange words 0,3,6 into 3,4,5 -extern "C" TALIGN16(const uint8, shufac3[16]) = +CONST uvec8 kShufAc3 = { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; // Scaling values for boxes of 3x3 and 2x3 -extern "C" TALIGN16(const uint16, scaleac3[8]) = +CONST uvec16 kScaleAc33 = { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; // Arrange first value for pixels 0,1,2,3,4,5 -extern "C" TALIGN16(const uint8, shufab0[16]) = +CONST uvec8 kShufAb0 = { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; // Arrange second value for pixels 0,1,2,3,4,5 -extern "C" TALIGN16(const uint8, shufab1[16]) = +CONST uvec8 kShufAb1 = { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; // Arrange third value for pixels 0,1,2,3,4,5 -extern "C" TALIGN16(const uint8, shufab2[16]) = +CONST uvec8 kShufAb2 = { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; // Scaling values for boxes of 3x2 and 2x2 -extern "C" TALIGN16(const uint16, scaleab2[8]) = +CONST uvec16 kScaleAb2 = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; #endif -#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SCALEROWDOWN2_SSE2 // Reads 32 pixels, throws half away and writes 16 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) -static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + align 16 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm7 - pand xmm1, xmm7 + pand xmm0, xmm5 + pand xmm1, xmm5 packuswb xmm0, xmm1 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 - ja wloop + jg wloop ret } } // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
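The 2x down-scalers come in a point-sampling flavour (keep every other pixel, as above) and a box flavour (average a 2x2 block, as in the function that follows). Per output pixel the box version is roughly this scalar sketch; the pavgb/pavgw sequence in the SSE2 code rounds slightly differently.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of ScaleRowDown2Int: each destination byte is the rounded
// average of a 2x2 block of source bytes.
static void ScaleRowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = static_cast<uint8_t>((s0[0] + s0[1] + s1[0] + s1[1] + 2) >> 2);
    s0 += 2;
    s1 += 2;
  }
}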
-__declspec(naked) -static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { push esi mov eax, [esp + 4 + 4] // src_ptr mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + align 16 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -319,16 +253,91 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, psrlw xmm0, 8 movdqa xmm3, xmm1 psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 + pand xmm2, xmm5 + pand xmm3, xmm5 pavgw xmm0, xmm2 pavgw xmm1, xmm3 packuswb xmm0, xmm1 + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 16 - ja wloop + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop pop esi ret @@ -338,63 +347,64 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN4_SSE2 // Point samples 32 pixels to 8 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) -static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr + mov eax, [esp + 4] // src_ptr // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - pcmpeqb xmm7, xmm7 // generate mask 0x000000ff - psrld xmm7, 24 + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + align 16 wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] - pand xmm0, xmm7 - pand xmm1, xmm7 + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 packuswb xmm0, xmm1 packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] sub ecx, 8 - ja wloop + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop - popad ret } } // Blends 32x4 rectangle to 8x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff psrlw xmm7, 8 - lea edx, [ebx + ebx * 2] // src_stride * 3 + align 16 wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - movdqa xmm2, [esi + ebx] - movdqa xmm3, [esi + ebx + 16] + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, [esi + ebx * 2] - movdqa xmm3, [esi + ebx * 2 + 16] - movdqa xmm4, [esi + edx] - movdqa xmm5, [esi + edx + 16] - lea esi, [esi + 32] + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] pavgb xmm2, xmm4 pavgb xmm3, xmm5 pavgb xmm0, xmm2 @@ -416,12 +426,13 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, pavgw xmm0, xmm2 packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] sub ecx, 8 - ja wloop + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop - popad + pop edi + pop esi ret } } @@ -429,64 +440,66 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN8_SSE2 // Point samples 32 pixels to 4 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
-__declspec(naked) -static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr + mov eax, [esp + 4] // src_ptr // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes - psrlq xmm7, 56 + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes + psrlq xmm5, 56 + align 16 wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] - pand xmm0, xmm7 - pand xmm1, xmm7 + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 packuswb xmm0, xmm1 // 32->16 packuswb xmm0, xmm0 // 16->8 packuswb xmm0, xmm0 // 8->4 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] sub ecx, 4 - ja wloop + movd dword ptr [edx], xmm0 + lea edx, [edx + 4] + jg wloop - popad ret } } // Blends 32x8 rectangle to 4x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - lea edx, [ebx + ebx * 2] // src_stride * 3 + push esi + push edi + push ebp + mov eax, [esp + 12 + 4] // src_ptr + mov esi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst_ptr + mov ecx, [esp + 12 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 pxor xmm7, xmm7 + align 16 wloop: - movdqa xmm0, [esi] // average 8 rows to 1 - movdqa xmm1, [esi + 16] - movdqa xmm2, [esi + ebx] - movdqa xmm3, [esi + ebx + 16] + movdqa xmm0, [eax] // average 8 rows to 1 + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - movdqa xmm2, [esi + ebx * 2] - movdqa xmm3, [esi + ebx * 2 + 16] - movdqa xmm4, [esi + edx] - movdqa xmm5, [esi + edx + 16] - lea ebp, [esi + ebx * 4] - lea esi, [esi + 32] + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea ebp, [eax + esi * 4] + lea eax, [eax + 32] pavgb xmm2, xmm4 pavgb xmm3, xmm5 pavgb xmm0, xmm2 @@ -494,15 +507,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, movdqa xmm2, [ebp] movdqa xmm3, [ebp + 16] - movdqa xmm4, [ebp + ebx] - movdqa xmm5, [ebp + ebx + 16] + movdqa xmm4, [ebp + esi] + movdqa xmm5, [ebp + esi + 16] pavgb xmm2, xmm4 pavgb xmm3, xmm5 - movdqa xmm4, [ebp + ebx * 2] - movdqa xmm5, [ebp + ebx * 2 + 16] - movdqa xmm6, [ebp + edx] + movdqa xmm4, [ebp + esi * 2] + movdqa xmm5, [ebp + esi * 2 + 16] + movdqa xmm6, [ebp + edi] pavgb xmm4, xmm6 - movdqa xmm6, [ebp + edx + 16] + movdqa xmm6, [ebp + edi + 16] pavgb xmm5, xmm6 pavgb xmm2, xmm4 pavgb xmm3, xmm5 @@ -517,60 +530,61 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, psrlw xmm0, 3 packuswb xmm0, xmm0 packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] sub ecx, 4 - ja wloop + movd dword ptr [edx], xmm0 + lea edx, [edx + 4] + jg wloop - popad + pop ebp + 
pop edi + pop esi ret } } #define HAS_SCALEROWDOWN34_SSSE3 // Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr + mov eax, [esp + 4] // src_ptr // src_stride ignored - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm3, _shuf0 - movdqa xmm4, _shuf1 - movdqa xmm5, _shuf2 + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + align 16 wloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + 16] - lea esi, [esi + 32] - movdqa xmm1, xmm2 + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 palignr xmm1, xmm0, 8 pshufb xmm0, xmm3 pshufb xmm1, xmm4 pshufb xmm2, xmm5 - movq qword ptr [edi], xmm0 - movq qword ptr [edi + 8], xmm1 - movq qword ptr [edi + 16], xmm2 - lea edi, [edi + 24] + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] sub ecx, 24 - ja wloop + jg wloop - popad ret } } // Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. // Register usage: @@ -581,86 +595,90 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, // xmm4 shuf 2 // xmm5 madd 0 // xmm6 madd 1 -// xmm7 round34 +// xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
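The shuffle and madd tables used here (kShuf01/kShuf11/kShuf21 with kMadd01/kMadd11/kMadd21 and kRound34) implement a 4-to-3 horizontal filter that runs after the two source rows have been averaged vertically. In scalar form the horizontal step is roughly the following sketch (my own names, derived from those coefficient tables):

#include <stdint.h>

// Scalar sketch of the 3/4 horizontal weighting: every 4 source pixels
// a, b, c, d produce 3 output pixels with 3:1, 2:2 and 1:3 weights, rounded.
static void ScaleRowDown34FilterSketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                       int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    const int a = src_ptr[0], b = src_ptr[1], c = src_ptr[2], d = src_ptr[3];
    dst_ptr[0] = static_cast<uint8_t>((a * 3 + b + 2) >> 2);
    dst_ptr[1] = static_cast<uint8_t>((b * 2 + c * 2 + 2) >> 2);
    dst_ptr[2] = static_cast<uint8_t>((c + d * 3 + 2) >> 2);
    src_ptr += 4;
    dst_ptr += 3;
  }
}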
-__declspec(naked) -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _round34 - + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 16 wloop: - movdqa xmm0, [esi] // pixels 0..7 - movdqa xmm1, [esi+ebx] + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 pmaddubsw xmm0, xmm5 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - movdqu xmm0, [esi+8] // pixels 8..15 - movdqu xmm1, [esi+ebx+8] + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 pmaddubsw xmm0, xmm6 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi+8], xmm0 - movdqa xmm0, [esi+16] // pixels 16..23 - movdqa xmm1, [esi+ebx+16] - lea esi, [esi+32] + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] pavgb xmm0, xmm1 pshufb xmm0, xmm4 - movdqa xmm1, _madd21 + movdqa xmm1, kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi+16], xmm0 - lea edi, [edi+24] sub ecx, 24 - ja wloop + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop - popad + pop esi ret } } // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov ebx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _round34 - + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 16 wloop: - movdqa xmm0, [esi] // pixels 0..7 - movdqa xmm1, [esi+ebx] + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -668,9 +686,9 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi], xmm0 - movdqu xmm0, [esi+8] // pixels 8..15 - movdqu xmm1, [esi+ebx+8] + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm3 @@ -678,24 +696,24 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi+8], xmm0 - movdqa xmm0, [esi+16] // pixels 16..23 - movdqa xmm1, [esi+ebx+16] - lea esi, [esi+32] + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] pavgb xmm1, xmm0 pavgb xmm0, xmm1 pshufb xmm0, xmm4 - movdqa xmm1, _madd21 + movdqa xmm1, kMadd21 pmaddubsw xmm0, xmm1 paddsw xmm0, xmm7 psrlw xmm0, 2 packuswb xmm0, xmm0 - movq qword ptr [edi+16], xmm0 - lea edi, [edi+24] sub ecx, 24 - ja wloop + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop - popad + pop esi ret } } @@ -704,202 +722,219 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm5, _shuf38a - movdqa xmm6, _shuf38b - pxor xmm7, xmm7 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + align 16 xloop: - movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 - lea esi, [esi + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm6 + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edi], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edi + 8], xmm1 - lea edi, [edi + 12] sub ecx, 12 - ja xloop + movq qword ptr [edx], xmm0 // write 12 
pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop - popad ret } } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm4, _shufac0 - movdqa xmm5, _shufac3 - movdqa xmm6, _scaleac3 - pxor xmm7, xmm7 + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + align 16 xloop: - movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 - movdqa xmm2, [esi + edx] + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] movhlps xmm1, xmm0 - movhlps xmm3, xmm2 - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - movdqa xmm2, [esi + edx * 2] - lea esi, [esi + 16] - movhlps xmm3, xmm2 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - - movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 - paddusw xmm2, xmm0 + paddusw xmm6, xmm0 psrldq xmm0, 2 - paddusw xmm2, xmm0 - pshufb xmm2, xmm4 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 - movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 - paddusw xmm3, xmm1 + paddusw xmm7, xmm1 psrldq xmm1, 2 - paddusw xmm3, xmm1 - pshufb xmm3, xmm5 - paddusw xmm2, xmm3 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 - pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 - packuswb xmm2, xmm2 + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 - movd [edi], xmm2 // write 6 pixels - pextrw eax, xmm2, 2 - mov [edi + 4], ax - lea edi, [edi + 6] sub ecx, 6 - ja xloop + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop - popad + pop esi ret } } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, +__declspec(naked) __declspec(align(16)) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - movdqa xmm4, _shufab0 - movdqa xmm5, _shufab1 - movdqa xmm6, _shufab2 - movdqa xmm7, _scaleab2 + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + align 16 xloop: - movdqa xmm2, 
[esi] // average 2 rows into xmm2 - pavgb xmm2, [esi + edx] - lea esi, [esi + 16] - - movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 pshufb xmm0, xmm4 - movdqa xmm1, xmm2 - pshufb xmm1, xmm5 - paddusw xmm0, xmm1 - pshufb xmm2, xmm6 - paddusw xmm0, xmm2 + paddusw xmm1, xmm0 - pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 - packuswb xmm0, xmm0 + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 - movd [edi], xmm0 // write 6 pixels - pextrw eax, xmm0, 2 - mov [edi + 4], ax - lea edi, [edi + 6] sub ecx, 6 - ja xloop + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop - popad + pop esi ret } } #define HAS_SCALEADDROWS_SSE2 -// Reads 8xN bytes and produces 16 shorts at a time. -__declspec(naked) -static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, +// Reads 16xN bytes and produces 16 shorts at a time. +__declspec(naked) __declspec(align(16)) +static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { __asm { - pushad - mov esi, [esp + 32 + 4] // src_ptr - mov edx, [esp + 32 + 8] // src_stride - mov edi, [esp + 32 + 12] // dst_ptr - mov ecx, [esp + 32 + 16] // dst_width - mov ebx, [esp + 32 + 20] // height - pxor xmm7, xmm7 + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 dec ebx + align 16 xloop: // first row - movdqa xmm2, [esi] + movdqa xmm0, [esi] lea eax, [esi + edx] - movhlps xmm3, xmm2 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] mov ebp, ebx - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 + test ebp, ebp + je ydone // sum remaining rows + align 16 yloop: - movdqa xmm0, [eax] // read 16 pixels + movdqa xmm2, [eax] // read 16 pixels lea eax, [eax + edx] // advance to next row - movhlps xmm1, xmm0 - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - paddusw xmm2, xmm0 // sum 16 words - paddusw xmm3, xmm1 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 sub ebp, 1 - ja yloop - - movdqa [edi], xmm2 - movdqa [edi + 16], xmm3 + jg yloop + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 lea edi, [edi + 32] - lea esi, [esi + 16] sub ecx, 16 - ja xloop + jg xloop - popad + pop ebp + pop ebx + pop edi + pop esi ret } } +#ifndef SSE2_DISABLED // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 
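The ScaleAddRows_SSE2 hunk above sums a column of src_height source rows into 16-bit totals for the box filter. A minimal scalar sketch of the same accumulation, assuming paddusw-style saturation (hypothetical helper, not part of the patch):

#include <stddef.h>
#include <stdint.h>

// Sum src_height rows of bytes into unsigned 16-bit totals, saturating
// at 65535 as the paddusw-based SIMD loop does.
static void ScaleAddRows_C_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                  uint16_t* dst_ptr, int src_width,
                                  int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = 0;
    const uint8_t* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = static_cast<uint16_t>(sum > 65535u ? 65535u : sum);
  }
}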
-#define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) +// Normal formula for bilinear interpolation is: +// source_y_fraction * row1 + (1 - source_y_fraction) row0 +// SSE2 version using the a single multiply of difference: +// source_y_fraction * (row1 - row0) + row0 +#define HAS_SCALEFILTERROWS_SSE2_DISABLED +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, int dst_width, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) { __asm { push esi @@ -909,88 +944,88 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi cmp eax, 0 je xloop1 cmp eax, 128 je xloop2 - movd xmm6, eax // xmm6 = y fraction - punpcklwd xmm6, xmm6 - pshufd xmm6, xmm6, 0 - neg eax // xmm5 = 256 - y fraction - add eax, 256 - movd xmm5, eax + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - pxor xmm7, xmm7 + pxor xmm4, xmm4 + align 16 xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] + movdqa xmm0, [esi] // row0 + movdqa xmm2, [esi + edx] // row1 movdqa xmm1, xmm0 movdqa xmm3, xmm2 - punpcklbw xmm0, xmm7 - punpcklbw xmm2, xmm7 - punpckhbw xmm1, xmm7 - punpckhbw xmm3, xmm7 - pmullw xmm0, xmm5 // scale row 0 - pmullw xmm1, xmm5 - pmullw xmm2, xmm6 // scale row 1 - pmullw xmm3, xmm6 - paddusw xmm0, xmm2 // sum rows - paddusw xmm1, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 packuswb xmm0, xmm1 - movdqa [edi], xmm0 - lea edi, [edi + 16] sub ecx, 16 - ja xloop + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop - mov al, [edi - 1] - mov [edi], al + punpckhbw xmm0, xmm0 // duplicate last pixel for filtering + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 pop edi pop esi ret + align 16 xloop1: movdqa xmm0, [esi] - lea esi, [esi + 16] - movdqa [edi], xmm0 - lea edi, [edi + 16] sub ecx, 16 - ja xloop1 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 - mov al, [edi - 1] - mov [edi], al + punpckhbw xmm0, xmm0 // duplicate last pixel for filtering + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 pop edi pop esi ret + align 16 xloop2: movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - pavgb xmm0, xmm2 - movdqa [edi], xmm0 - lea edi, [edi + 16] + pavgb xmm0, [esi + edx] sub ecx, 16 - ja xloop2 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 - mov al, [edi - 1] - mov [edi], al + punpckhbw xmm0, xmm0 // duplicate last pixel for filtering + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 pop edi pop esi ret } } - +#endif // SSE2_DISABLED // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
#define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, int dst_width, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) { __asm { push esi @@ -1000,1491 +1035,996 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 cmp eax, 0 je xloop1 - cmp eax, 128 + cmp eax, 64 je xloop2 + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 - shr eax, 1 - mov ah,al - neg al - add al, 128 - movd xmm7, eax - punpcklwd xmm7, xmm7 - pshufd xmm7, xmm7, 0 - + align 16 xloop: movdqa xmm0, [esi] movdqa xmm2, [esi + edx] - lea esi, [esi + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - movdqa [edi], xmm0 - lea edi, [edi + 16] sub ecx, 16 - ja xloop + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + + punpckhbw xmm0, xmm0 // duplicate last pixel for filtering + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 - mov al, [edi - 1] - mov [edi], al pop edi pop esi ret + align 16 xloop1: movdqa xmm0, [esi] - lea esi, [esi + 16] - movdqa [edi], xmm0 - lea edi, [edi + 16] sub ecx, 16 - ja xloop1 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 - mov al, [edi - 1] - mov [edi], al + punpckhbw xmm0, xmm0 + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 pop edi pop esi ret + align 16 xloop2: movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - lea esi, [esi + 16] - pavgb xmm0, xmm2 - movdqa [edi], xmm0 - lea edi, [edi + 16] + pavgb xmm0, [esi + edx] sub ecx, 16 - ja xloop2 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 - mov al, [edi - 1] - mov [edi], al + punpckhbw xmm0, xmm0 + pshufhw xmm0, xmm0, 0xff + punpckhqdq xmm0, xmm0 + movdqa [esi + edi], xmm0 pop edi pop esi ret - } } -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
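Both ScaleFilterRows variants above blend two adjacent source rows with an 8-bit fraction, as described by the "single multiply of difference" comment in the SSE2 hunk. A scalar sketch of that blend (hypothetical helper; the SIMD code's exact scaling and rounding differ slightly):

#include <stddef.h>
#include <stdint.h>

// dst = row0 * (256 - f) + row1 * f, with f = source_y_fraction in 0..255.
// The SSE2 path computes the algebraically equivalent
//   dst = row0 + f * (row1 - row0)
// so each element needs only one multiply.
static void ScaleFilterRows_C_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  const uint8_t* row0 = src_ptr;
  const uint8_t* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int blended = row0[x] * (256 - source_y_fraction) +
                  row1[x] * source_y_fraction;
    dst_ptr[x] = static_cast<uint8_t>(blended >> 8);
  }
}

Note that the SIMD versions also duplicate the last output pixel afterwards ("duplicate last pixel for filtering"), which the sketch omits.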
-__declspec(naked) -static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - movdqa xmm1, _round34 - movdqa xmm2, _shuf01 - movdqa xmm3, _shuf11 - movdqa xmm4, _shuf21 - movdqa xmm5, _madd01 - movdqa xmm6, _madd11 - movdqa xmm7, _madd21 - - wloop: - movdqa xmm0, [eax] // pixels 0..7 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax+8] // pixels 8..15 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx+8], xmm0 - movdqa xmm0, [eax+16] // pixels 16..23 - lea eax, [eax+32] - pshufb xmm0, xmm4 - pmaddubsw xmm0, xmm7 - paddsw xmm0, xmm1 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx+16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - ja wloop - ret - } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt #define HAS_SCALEROWDOWN2_SSE2 -static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory" -); + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); } -static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movdqa 0x10(%0,%3,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "movdqa %%xmm1,%%xmm3\n" - "psrlw $0x8,%%xmm1\n" - "pand %%xmm7,%%xmm2\n" - "pand %%xmm7,%%xmm3\n" - "pavgw %%xmm2,%%xmm0\n" - "pavgw %%xmm3,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa 
%%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu (%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory" -); + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); } #define HAS_SCALEROWDOWN4_SSE2 -static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrld $0x18,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory" -); + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); } -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - intptr_t temp = 0; - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" - "lea (%4,%4,2),%3\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 
(%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa (%0,%4,2),%%xmm2\n" - "movdqa 0x10(%0,%4,2),%%xmm3\n" - "movdqa (%0,%3,1),%%xmm4\n" - "movdqa 0x10(%0,%3,1),%%xmm5\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "movdqa %%xmm1,%%xmm3\n" - "psrlw $0x8,%%xmm1\n" - "pand %%xmm7,%%xmm2\n" - "pand %%xmm7,%%xmm3\n" - "pavgw %%xmm2,%%xmm0\n" - "pavgw %%xmm3,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "pand %%xmm7,%%xmm2\n" - "pavgw %%xmm2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 - "+r"(temp) // %3 + "+r"(stridex3) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory" -); + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); } #define HAS_SCALEROWDOWN8_SSE2 -static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlq $0x38,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory" -); -} - -#if defined(__i386__) -extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown8Int_SSE2\n" -"_ScaleRowDown8Int_SSE2:\n" -#else - ".global ScaleRowDown8Int_SSE2\n" -"ScaleRowDown8Int_SSE2:\n" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", 
"xmm1", "xmm5" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "lea (%ebx,%ebx,2),%edx\n" - "pxor %xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm1\n" - "movdqa (%esi,%ebx,1),%xmm2\n" - "movdqa 0x10(%esi,%ebx,1),%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "movdqa (%esi,%ebx,2),%xmm2\n" - "movdqa 0x10(%esi,%ebx,2),%xmm3\n" - "movdqa (%esi,%edx,1),%xmm4\n" - "movdqa 0x10(%esi,%edx,1),%xmm5\n" - "lea (%esi,%ebx,4),%ebp\n" - "lea 0x20(%esi),%esi\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "movdqa 0x0(%ebp),%xmm2\n" - "movdqa 0x10(%ebp),%xmm3\n" - "movdqa 0x0(%ebp,%ebx,1),%xmm4\n" - "movdqa 0x10(%ebp,%ebx,1),%xmm5\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "movdqa 0x0(%ebp,%ebx,2),%xmm4\n" - "movdqa 0x10(%ebp,%ebx,2),%xmm5\n" - "movdqa 0x0(%ebp,%edx,1),%xmm6\n" - "pavgb %xmm6,%xmm4\n" - "movdqa 0x10(%ebp,%edx,1),%xmm6\n" - "pavgb %xmm6,%xmm5\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "psadbw %xmm7,%xmm0\n" - "psadbw %xmm7,%xmm1\n" - "pshufd $0xd8,%xmm0,%xmm0\n" - "pshufd $0x8d,%xmm1,%xmm1\n" - "por %xmm1,%xmm0\n" - "psrlw $0x3,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movd %xmm0,(%edi)\n" - "lea 0x4(%edi),%edi\n" - "sub $0x4,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -// fpic is used for magiccam plugin -#if !defined(__PIC__) -#define HAS_SCALEROWDOWN34_SSSE3 -extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown34_SSSE3\n" -"_ScaleRowDown34_SSSE3:\n" -#else - ".global ScaleRowDown34_SSSE3\n" -"ScaleRowDown34_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf0,%xmm3\n" - "movdqa _shuf1,%xmm4\n" - "movdqa _shuf2,%xmm5\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm2\n" - "lea 0x20(%esi),%esi\n" - "movdqa %xmm2,%xmm1\n" - "palignr $0x8,%xmm0,%xmm1\n" - "pshufb %xmm3,%xmm0\n" - "pshufb %xmm4,%xmm1\n" - "pshufb %xmm5,%xmm2\n" - "movq %xmm0,(%edi)\n" - "movq %xmm1,0x8(%edi)\n" - "movq %xmm2,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown34_1_Int_SSSE3\n" -"_ScaleRowDown34_1_Int_SSSE3:\n" -#else - ".global ScaleRowDown34_1_Int_SSSE3\n" -"ScaleRowDown34_1_Int_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebp\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf01,%xmm2\n" - "movdqa _shuf11,%xmm3\n" - "movdqa _shuf21,%xmm4\n" - "movdqa _madd01,%xmm5\n" - "movdqa _madd11,%xmm6\n" - "movdqa _round34,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%ebp),%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm2,%xmm0\n" - "pmaddubsw %xmm5,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movdqu 0x8(%esi),%xmm0\n" - "movdqu 0x8(%esi,%ebp),%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm3,%xmm0\n" - "pmaddubsw %xmm6,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x8(%edi)\n" - "movdqa 0x10(%esi),%xmm0\n" - "movdqa 0x10(%esi,%ebp),%xmm1\n" - "lea 
0x20(%esi),%esi\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa _madd21,%xmm1\n" - "pmaddubsw %xmm1,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" - - "popa\n" - "ret\n" -); - -extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown34_0_Int_SSSE3\n" -"_ScaleRowDown34_0_Int_SSSE3:\n" -#else - ".global ScaleRowDown34_0_Int_SSSE3\n" -"ScaleRowDown34_0_Int_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebp\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf01,%xmm2\n" - "movdqa _shuf11,%xmm3\n" - "movdqa _shuf21,%xmm4\n" - "movdqa _madd01,%xmm5\n" - "movdqa _madd11,%xmm6\n" - "movdqa _round34,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%ebp,1),%xmm1\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm2,%xmm0\n" - "pmaddubsw %xmm5,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movdqu 0x8(%esi),%xmm0\n" - "movdqu 0x8(%esi,%ebp,1),%xmm1\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm3,%xmm0\n" - "pmaddubsw %xmm6,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x8(%edi)\n" - "movdqa 0x10(%esi),%xmm0\n" - "movdqa 0x10(%esi,%ebp,1),%xmm1\n" - "lea 0x20(%esi),%esi\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa _madd21,%xmm1\n" - "pmaddubsw %xmm1,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -#define HAS_SCALEROWDOWN38_SSSE3 -extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown38_SSSE3\n" -"_ScaleRowDown38_SSSE3:\n" -#else - ".global ScaleRowDown38_SSSE3\n" -"ScaleRowDown38_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf38a ,%xmm5\n" - "movdqa _shuf38b ,%xmm6\n" - "pxor %xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm1\n" - "lea 0x20(%esi),%esi\n" - "pshufb %xmm5,%xmm0\n" - "pshufb %xmm6,%xmm1\n" - "paddusb %xmm1,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movhlps %xmm0,%xmm1\n" - "movd %xmm1,0x8(%edi)\n" - "lea 0xc(%edi),%edi\n" - "sub $0xc,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown38_3_Int_SSSE3\n" -"_ScaleRowDown38_3_Int_SSSE3:\n" -#else - ".global ScaleRowDown38_3_Int_SSSE3\n" -"ScaleRowDown38_3_Int_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shufac0,%xmm4\n" - "movdqa _shufac3,%xmm5\n" - "movdqa _scaleac3,%xmm6\n" - "pxor %xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "movhlps %xmm0,%xmm1\n" - "movhlps %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm0\n" - "punpcklbw %xmm7,%xmm1\n" - "punpcklbw %xmm7,%xmm2\n" - "punpcklbw %xmm7,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "movdqa 
(%esi,%edx,2),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movhlps %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm2\n" - "punpcklbw %xmm7,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "movdqa %xmm0,%xmm2\n" - "psrldq $0x2,%xmm0\n" - "paddusw %xmm0,%xmm2\n" - "psrldq $0x2,%xmm0\n" - "paddusw %xmm0,%xmm2\n" - "pshufb %xmm4,%xmm2\n" - "movdqa %xmm1,%xmm3\n" - "psrldq $0x2,%xmm1\n" - "paddusw %xmm1,%xmm3\n" - "psrldq $0x2,%xmm1\n" - "paddusw %xmm1,%xmm3\n" - "pshufb %xmm5,%xmm3\n" - "paddusw %xmm3,%xmm2\n" - "pmulhuw %xmm6,%xmm2\n" - "packuswb %xmm2,%xmm2\n" - "movd %xmm2,(%edi)\n" - "pextrw $0x2,%xmm2,%eax\n" - "mov %ax,0x4(%edi)\n" - "lea 0x6(%edi),%edi\n" - "sub $0x6,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleRowDown38_2_Int_SSSE3\n" -"_ScaleRowDown38_2_Int_SSSE3:\n" -#else - ".global ScaleRowDown38_2_Int_SSSE3\n" -"ScaleRowDown38_2_Int_SSSE3:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shufab0,%xmm4\n" - "movdqa _shufab1,%xmm5\n" - "movdqa _shufab2,%xmm6\n" - "movdqa _scaleab2,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm2\n" - "pavgb (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm2,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa %xmm2,%xmm1\n" - "pshufb %xmm5,%xmm1\n" - "paddusw %xmm1,%xmm0\n" - "pshufb %xmm6,%xmm2\n" - "paddusw %xmm2,%xmm0\n" - "pmulhuw %xmm7,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movd %xmm0,(%edi)\n" - "pextrw $0x2,%xmm0,%eax\n" - "mov %ax,0x4(%edi)\n" - "lea 0x6(%edi),%edi\n" - "sub $0x6,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); -#endif // __PIC__ - -#define HAS_SCALEADDROWS_SSE2 -extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, - int src_height); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleAddRows_SSE2\n" -"_ScaleAddRows_SSE2:\n" -#else - ".global ScaleAddRows_SSE2\n" -"ScaleAddRows_SSE2:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "mov 0x34(%esp),%ebx\n" - "pxor %xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm2\n" - "lea (%esi,%edx,1),%eax\n" - "movhlps %xmm2,%xmm3\n" - "lea -0x1(%ebx),%ebp\n" - "punpcklbw %xmm7,%xmm2\n" - "punpcklbw %xmm7,%xmm3\n" - -"2:" - "movdqa (%eax),%xmm0\n" - "lea (%eax,%edx,1),%eax\n" - "movhlps %xmm0,%xmm1\n" - "punpcklbw %xmm7,%xmm0\n" - "punpcklbw %xmm7,%xmm1\n" - "paddusw %xmm0,%xmm2\n" - "paddusw %xmm1,%xmm3\n" - "sub $0x1,%ebp\n" - "ja 2b\n" - - "movdqa %xmm2,(%edi)\n" - "movdqa %xmm3,0x10(%edi)\n" - "lea 0x20(%edi),%edi\n" - "lea 0x10(%esi),%esi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -// Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version -#define HAS_SCALEFILTERROWS_SSE2 -extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleFilterRows_SSE2\n" -"_ScaleFilterRows_SSE2:\n" -#else - ".global ScaleFilterRows_SSE2\n" -"ScaleFilterRows_SSE2:\n" -#endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%edi\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%ecx\n" - "mov 0x1c(%esp),%eax\n" - "cmp $0x0,%eax\n" - "je 2f\n" - "cmp $0x80,%eax\n" - "je 3f\n" - "movd %eax,%xmm6\n" - "punpcklwd %xmm6,%xmm6\n" - "pshufd $0x0,%xmm6,%xmm6\n" - "neg %eax\n" - "add $0x100,%eax\n" - "movd %eax,%xmm5\n" - "punpcklwd %xmm5,%xmm5\n" - "pshufd $0x0,%xmm5,%xmm5\n" - "pxor %xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,%xmm1\n" - "movdqa %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm0\n" - "punpcklbw %xmm7,%xmm2\n" - "punpckhbw %xmm7,%xmm1\n" - "punpckhbw %xmm7,%xmm3\n" - "pmullw %xmm5,%xmm0\n" - "pmullw %xmm5,%xmm1\n" - "pmullw %xmm6,%xmm2\n" - "pmullw %xmm6,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "psrlw $0x8,%xmm0\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" - -"2:" - "movdqa (%esi),%xmm0\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 2b\n" - - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" - -"3:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "pavgb %xmm2,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 3b\n" - - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" -); + ); +} -// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version -#define HAS_SCALEFILTERROWS_SSSE3 -extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, - int dst_width, int source_y_fraction); - asm( - ".text\n" -#if defined(OSX) - ".globl _ScaleFilterRows_SSSE3\n" -"_ScaleFilterRows_SSSE3:\n" -#else - ".global ScaleFilterRows_SSSE3\n" -"ScaleFilterRows_SSSE3:\n" -#endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%edi\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%ecx\n" - "mov 0x1c(%esp),%eax\n" - "cmp $0x0,%eax\n" - "je 2f\n" - "cmp $0x80,%eax\n" - "je 3f\n" - "shr %eax\n" - "mov %al,%ah\n" - "neg %al\n" - "add $0x80,%al\n" - "movd %eax,%xmm7\n" - "punpcklwd %xmm7,%xmm7\n" - "pshufd $0x0,%xmm7,%xmm7\n" - -"1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,%xmm1\n" - "punpcklbw %xmm2,%xmm0\n" - "punpckhbw %xmm2,%xmm1\n" - "pmaddubsw %xmm7,%xmm0\n" - "pmaddubsw %xmm7,%xmm1\n" - "psrlw $0x7,%xmm0\n" - "psrlw $0x7,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" - -"2:" - "movdqa (%esi),%xmm0\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 2b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" - -"3:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "pavgb %xmm2,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 3b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" -); - -#elif defined(__x86_64__) -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "lea (%3,%3,2),%%r10\n" - "pxor %%xmm7,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movdqa 0x10(%0,%3,1),%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa (%0,%3,2),%%xmm2\n" - "movdqa 0x10(%0,%3,2),%%xmm3\n" - "movdqa (%0,%%r10,1),%%xmm4\n" - "movdqa 0x10(%0,%%r10,1),%%xmm5\n" - "lea (%0,%3,4),%%r11\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa 0x0(%%r11),%%xmm2\n" - "movdqa 0x10(%%r11),%%xmm3\n" - "movdqa 0x0(%%r11,%3,1),%%xmm4\n" - "movdqa 0x10(%%r11,%3,1),%%xmm5\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "movdqa 0x0(%%r11,%3,2),%%xmm4\n" - "movdqa 0x10(%%r11,%3,2),%%xmm5\n" - "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n" - "pavgb %%xmm6,%%xmm4\n" - "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n" - "pavgb %%xmm6,%%xmm5\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "psadbw %%xmm7,%%xmm0\n" - "psadbw %%xmm7,%%xmm1\n" - "pshufd $0xd8,%%xmm0,%%xmm0\n" - "pshufd $0x8d,%%xmm1,%%xmm1\n" - "por %%xmm1,%%xmm0\n" - "psrlw $0x3,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + intptr_t stridex3 = 0; + intptr_t row4 = 0; + asm volatile ( + "lea (%5,%5,2),%3 \n" + "pxor %%xmm7,%%xmm7 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%5,1),%%xmm2 \n" + "movdqa 0x10(%0,%5,1),%%xmm3 \n" + "pavgb 
%%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%5,2),%%xmm2 \n" + "movdqa 0x10(%0,%5,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea (%0,%5,4),%4 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%4),%%xmm2 \n" + "movdqa 0x10(%4),%%xmm3 \n" + "movdqa 0x0(%4,%5,1),%%xmm4 \n" + "movdqa 0x10(%4,%5,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%4,%5,2),%%xmm4 \n" + "movdqa 0x10(%4,%5,2),%%xmm5 \n" + "movdqa 0x0(%4,%3,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%4,%3,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd $0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7" -); + "+rm"(dst_width), // %2 + "+r"(stridex3), // %3 + "+r"(row4) // %4 + : "r"(static_cast<intptr_t>(src_stride)) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); } #define HAS_SCALEROWDOWN34_SSSE3 -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%3),%%xmm3\n" - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm2\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm2,%%xmm1\n" - "palignr $0x8,%%xmm0,%%xmm1\n" - "pshufb %%xmm3,%%xmm0\n" - "pshufb %%xmm4,%%xmm1\n" - "pshufb %%xmm5,%%xmm2\n" - "movq %%xmm0,(%1)\n" - "movq %%xmm1,0x8(%1)\n" - "movq %%xmm2,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(_shuf0), // %3 - "r"(_shuf1), // %4 - "r"(_shuf2) // %5 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -); + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm2\n" // _shuf01 - "movdqa (%5),%%xmm3\n" // _shuf11 - "movdqa (%6),%%xmm4\n" // _shuf21 - "movdqa (%7),%%xmm5\n" // _madd01 - 
"movdqa (%8),%%xmm6\n" // _madd11 - "movdqa (%9),%%xmm7\n" // _round34 - "movdqa (%10),%%xmm8\n" // _madd21 -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3),%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm2,%%xmm0\n" - "pmaddubsw %%xmm5,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqu 0x8(%0),%%xmm0\n" - "movdqu 0x8(%0,%3),%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm3,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x8(%1)\n" - "movdqa 0x10(%0),%%xmm0\n" - "movdqa 0x10(%0,%3),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm8,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%3),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm6 \n" + "movdqa 0x10(%0,%3),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 : "r"(static_cast<intptr_t>(src_stride)), // %3 - "r"(_shuf01), // %4 - "r"(_shuf11), // %5 - "r"(_shuf21), // %6 - "r"(_madd01), // %7 - "r"(_madd11), // %8 - "r"(_round34), // %9 - "r"(_madd21) // %10 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" -); -} - -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm2\n" // _shuf01 - "movdqa (%5),%%xmm3\n" // _shuf11 - "movdqa (%6),%%xmm4\n" // _shuf21 - "movdqa (%7),%%xmm5\n" // _madd01 - "movdqa (%8),%%xmm6\n" // _madd11 - "movdqa (%9),%%xmm7\n" // _round34 - "movdqa (%10),%%xmm8\n" // _madd21 -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3,1),%%xmm1\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm2,%%xmm0\n" - "pmaddubsw %%xmm5,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqu 
0x8(%0),%%xmm0\n" - "movdqu 0x8(%0,%3,1),%%xmm1\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm3,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x8(%1)\n" - "movdqa 0x10(%0),%%xmm0\n" - "movdqa 0x10(%0,%3,1),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm8,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "r"(_shuf01), // %4 - "r"(_shuf11), // %5 - "r"(_shuf21), // %6 - "r"(_madd01), // %7 - "r"(_madd11), // %8 - "r"(_round34), // %9 - "r"(_madd21) // %10 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" -); + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm6 \n" + "movdqa 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); } #define HAS_SCALEROWDOWN38_SSSE3 -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%3),%%xmm5\n" - "movdqa (%4),%%xmm6\n" - "pxor %%xmm7,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" - "paddusb %%xmm1,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movhlps %%xmm0,%%xmm1\n" - "movd %%xmm1,0x8(%1)\n" - "lea 0xc(%1),%1\n" - "sub $0xc,%2\n" - "ja 1b\n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(_shuf38a), // %3 - "r"(_shuf38b) // %4 - : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7" -); + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + 
"lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); } -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" - "movdqa (%6),%%xmm6\n" - "pxor %%xmm7,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movhlps %%xmm0,%%xmm1\n" - "movhlps %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm0\n" - "punpcklbw %%xmm7,%%xmm1\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpcklbw %%xmm7,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "movdqa (%0,%3,2),%%xmm2\n" - "lea 0x10(%0),%0\n" - "movhlps %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpcklbw %%xmm7,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrldq $0x2,%%xmm0\n" - "paddusw %%xmm0,%%xmm2\n" - "psrldq $0x2,%%xmm0\n" - "paddusw %%xmm0,%%xmm2\n" - "pshufb %%xmm4,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "psrldq $0x2,%%xmm1\n" - "paddusw %%xmm1,%%xmm3\n" - "psrldq $0x2,%%xmm1\n" - "paddusw %%xmm1,%%xmm3\n" - "pshufb %%xmm5,%%xmm3\n" - "paddusw %%xmm3,%%xmm2\n" - "pmulhuw %%xmm6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movd %%xmm2,(%1)\n" - "pextrw $0x2,%%xmm2,%%eax\n" - "mov %%ax,0x4(%1)\n" - "lea 0x6(%1),%1\n" - "sub $0x6,%2\n" - "ja 1b\n" + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pavgb (%0,%3,1),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "sub $0x6,%2 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "r"(_shufac0), // %4 - "r"(_shufac3), // %5 - "r"(_scaleac3) // %6 - : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7" -); + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); } -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" - "movdqa (%6),%%xmm6\n" - "movdqa (%7),%%xmm7\n" -"1:" - "movdqa (%0),%%xmm2\n" - "pavgb (%0,%3,1),%%xmm2\n" - "lea 0x10(%0),%0\n" - "movdqa %%xmm2,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "movdqa %%xmm2,%%xmm1\n" - "pshufb %%xmm5,%%xmm1\n" - "paddusw %%xmm1,%%xmm0\n" - "pshufb %%xmm6,%%xmm2\n" - "paddusw %%xmm2,%%xmm0\n" - "pmulhuw 
%%xmm7,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "pextrw $0x2,%%xmm0,%%eax\n" - "mov %%ax,0x4(%1)\n" - "lea 0x6(%1),%1\n" - "sub $0x6,%2\n" - "ja 1b\n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "r"(_shufab0), // %4 - "r"(_shufab1), // %5 - "r"(_shufab2), // %6 - "r"(_scaleab2) // %7 - : "memory", "rax", "xmm0", "xmm1", "xmm2", - "xmm4", "xmm5", "xmm6", "xmm7" -); + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); } #define HAS_SCALEADDROWS_SSE2 -static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - asm volatile( - "pxor %%xmm7,%%xmm7\n" -"1:" - "movdqa (%0),%%xmm2\n" - "lea (%0,%4,1),%%r10\n" - "movhlps %%xmm2,%%xmm3\n" - "lea -0x1(%3),%%r11\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpcklbw %%xmm7,%%xmm3\n" - -"2:" - "movdqa (%%r10),%%xmm0\n" - "lea (%%r10,%4,1),%%r10\n" - "movhlps %%xmm0,%%xmm1\n" - "punpcklbw %%xmm7,%%xmm0\n" - "punpcklbw %%xmm7,%%xmm1\n" - "paddusw %%xmm0,%%xmm2\n" - "paddusw %%xmm1,%%xmm3\n" - "sub $0x1,%%r11\n" - "ja 2b\n" - - "movdqa %%xmm2,(%1)\n" - "movdqa %%xmm3,0x10(%1)\n" - "lea 0x20(%1),%1\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" +static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + "2: \n" + "movdqa (%0),%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + "3: \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x10(%3),%0 \n" + "lea 0x20(%1),%1 \n" + 
"sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(src_width), // %2 - "+r"(src_height) // %3 - : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7" -); + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"(static_cast<intptr_t>(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); } +#ifndef SSE2_DISABLED // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version -#define HAS_SCALEFILTERROWS_SSE2 +#define HAS_SCALEFILTERROWS_SSE2_DISABLED static void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, + const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - if (source_y_fraction == 0) { - asm volatile( - "1:" - "movdqa (%1),%%xmm0\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "rax", "xmm0" - ); - return; - } else if (source_y_fraction == 128) { - asm volatile( - "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%3,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "pavgb %%xmm2,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "rax", "xmm0", "xmm2" - ); - return; - } else { - asm volatile( - "mov %3,%%eax\n" - "movd %%eax,%%xmm6\n" - "punpcklwd %%xmm6,%%xmm6\n" - "pshufd $0x0,%%xmm6,%%xmm6\n" - "neg %%eax\n" - "add $0x100,%%eax\n" - "movd %%eax,%%xmm5\n" - "punpcklwd %%xmm5,%%xmm5\n" - "pshufd $0x0,%%xmm5,%%xmm5\n" - "pxor %%xmm7,%%xmm7\n" - "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%4,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,%%xmm1\n" - "movdqa %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm0\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpckhbw %%xmm7,%%xmm1\n" - "punpckhbw %%xmm7,%%xmm3\n" - "pmullw %%xmm5,%%xmm0\n" - "pmullw %%xmm5,%%xmm1\n" - "pmullw %%xmm6,%%xmm2\n" - "pmullw %%xmm6,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", - "xmm5", "xmm6", "xmm7" - ); - } - return; + asm volatile ( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x80,%3 \n" + "je 3f \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 
0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + ".p2align 4 \n" + "4: \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm0 \n" + "punpckhqdq %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } +#endif // SSE2_DISABLED // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version #define HAS_SCALEFILTERROWS_SSSE3 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, + const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - if (source_y_fraction == 0) { - asm volatile( - "1:" - "movdqa (%1),%%xmm0\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "rax", "xmm0" - ); - return; - } else if (source_y_fraction == 128) { - asm volatile( - "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%3,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "pavgb %%xmm2,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "rax", "xmm0", "xmm2" - ); - return; - } else { - asm volatile( - "mov %3,%%eax\n" - "shr %%eax\n" - "mov %%al,%%ah\n" - "neg %%al\n" - "add $0x80,%%al\n" - "movd %%eax,%%xmm7\n" - "punpcklwd %%xmm7,%%xmm7\n" - "pshufd $0x0,%%xmm7,%%xmm7\n" - "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%4,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,%%xmm1\n" - "punpcklbw %%xmm2,%%xmm0\n" - "punpckhbw %%xmm2,%%xmm1\n" - "pmaddubsw %%xmm7,%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm1\n" - "psrlw $0x7,%%xmm0\n" - "psrlw $0x7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7" - ); - } - return; -} -#endif + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x40,%3 \n" + "je 3f \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa 
%%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + ".p2align 4 \n" + "4: \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm0 \n" + "punpckhqdq %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" #endif + ); +} +#endif // defined(__x86_64__) || defined(__i386__) // CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* src_ptr, int, +static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 2; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[2]; + dst += 2; + src_ptr += 4; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = src_ptr[0]; } } -static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, - uint8* dst, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - *dst++ = (src_ptr[0] + src_ptr[1] + - src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; - src_ptr += 2; +void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; } } -static void ScaleRowDown4_C(const uint8* src_ptr, int, +static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 4; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[4]; + dst += 2; + src_ptr += 8; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = src_ptr[0]; } } -static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + - 8) >> 4; - src_ptr += 4; + intptr_t stride = src_stride; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + 
src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; } } @@ -2493,19 +2033,25 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, static const int kMaxOutputWidth = 640; static const int kMaxRow12 = kMaxOutputWidth * 2; -static void ScaleRowDown8_C(const uint8* src_ptr, int, +static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - *dst++ = *src_ptr; - src_ptr += 8; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[8]; + dst += 2; + src_ptr += 16; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = src_ptr[0]; } } // Note calling code checks width is less than max and if not // uses ScaleRowDown8_C instead. -static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { - ALIGN16(uint8 src_row[kMaxRow12 * 2]); + SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]); assert(dst_width <= kMaxOutputWidth); ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, @@ -2514,7 +2060,7 @@ static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); } -static void ScaleRowDown34_C(const uint8* src_ptr, int, +static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); uint8* dend = dst + dst_width; @@ -2528,12 +2074,12 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int, } // Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = d + dst_width; const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; + uint8* dend = d + dst_width; do { uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -2551,12 +2097,12 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, } // Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = d + dst_width; const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; + uint8* dend = d + dst_width; do { uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -2573,13 +2119,42 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, } while (d < dend); } +// (1-f)a + fb can be replaced 
with a + f(b-a) +#define BLENDER(a, b, f) (static_cast<int>(a) + \ + ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +static const int kMaxInputWidth = 2560; + #if defined(HAS_SCALEFILTERROWS_SSE2) // Filter row to 3/4 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = dst_ptr + dst_width; const uint8* s = src_ptr; + uint8* dend = dst_ptr + dst_width; do { dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; @@ -2588,45 +2163,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, s += 4; } while (dst_ptr < dend); } -#endif - -static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int dx) { - int x = 0; - for (int j = 0; j < dst_width; ++j) { - int xi = x >> 16; - int xf1 = x & 0xffff; - int xf0 = 65536 - xf1; - - *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; - x += dx; - } -} -static const int kMaxInputWidth = 2560; -#if defined(HAS_SCALEFILTERROWS_SSE2) -#define HAS_SCALEROWDOWN34_SSE2 +#define HAS_SCALEROWDOWN34_SSE2_DISABLED // Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - ALIGN16(uint8 row[kMaxInputWidth]); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, - 256 / 4); + SIMD_ALIGNED(uint8 row[kMaxInputWidth]); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); ScaleFilterCols34_C(dst_ptr, row, dst_width); } // Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - ALIGN16(uint8 row[kMaxInputWidth]); + SIMD_ALIGNED(uint8 row[kMaxInputWidth]); ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); ScaleFilterCols34_C(dst_ptr, row, dst_width); } #endif -static void ScaleRowDown38_C(const uint8* src_ptr, int, +static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { assert(dst_width % 3 == 0); for (int x = 0; x < dst_width; x += 3) { @@ -2639,23 +2199,25 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int, } // 8x3 -> 3x1 -static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, + ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - for (int i = 0; i < dst_width; i+=3) { + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + 
src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * (65536 / 9) >> 16; dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * (65536 / 9) >> 16; dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * (65536 / 6) >> 16; src_ptr += 8; dst_ptr += 3; @@ -2663,18 +2225,19 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, } // 8x2 -> 3x1 -static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); - for (int i = 0; i < dst_width; i+=3) { + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + src_ptr[stride + 6] + src_ptr[stride + 7]) * (65536 / 4) >> 16; src_ptr += 8; dst_ptr += 3; @@ -2683,7 +2246,7 @@ static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, // C version 8x2 -> 8x1 static void ScaleFilterRows_C(uint8* dst_ptr, - const uint8* src_ptr, int src_stride, + const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { assert(dst_width > 0); int y1_fraction = source_y_fraction; @@ -2706,7 +2269,7 @@ static void ScaleFilterRows_C(uint8* dst_ptr, dst_ptr[0] = dst_ptr[-1]; } -void ScaleAddRows_C(const uint8* src_ptr, int src_stride, +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { assert(src_width > 0); assert(src_height > 0); @@ -2728,35 +2291,31 @@ void ScaleAddRows_C(const uint8* src_ptr, int src_stride, * its original size. * */ -static void ScalePlaneDown2(int src_width, int src_height, +static void ScalePlaneDown2(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr, FilterMode filtering) { - assert(src_width % 2 == 0); - assert(src_height % 2 == 0); - void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? 
ScaleRowDown2Int_C : ScaleRowDown2_C; #if defined(HAS_SCALEROWDOWN2_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (dst_width % 16 == 0) && (src_stride % 16 == 0) && - (dst_stride % 16 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; - } else -#endif -#if defined(HAS_SCALEROWDOWN2_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(dst_ptr, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; - } else -#endif - { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; } +#elif defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 : + ScaleRowDown2_Unaligned_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } + } +#endif + // TODO(fbarchard): Loop through source height to allow odd height. for (int y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += (src_stride << 1); @@ -2770,34 +2329,26 @@ static void ScalePlaneDown2(int src_width, int src_height, * This is an optimized version for scaling down a plane to 1/4 of * its original size. */ -static void ScalePlaneDown4(int src_width, int src_height, +static void ScalePlaneDown4(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr, FilterMode filtering) { - assert(src_width % 4 == 0); - assert(src_height % 4 == 0); - void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); - + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; #if defined(HAS_SCALEROWDOWN4_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (dst_width % 2 == 0) && (src_stride % 8 == 0) && - IS_ALIGNED(src_ptr, 8)) { + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 4)) { ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; - } else -#endif -#if defined(HAS_SCALEROWDOWN4_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (dst_width % 8 == 0) && (src_stride % 16 == 0) && - (dst_stride % 8 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + } +#elif defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; - } else -#endif - { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; } +#endif for (int y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); @@ -2813,27 +2364,23 @@ static void ScalePlaneDown4(int src_width, int src_height, * of its original size. 
* */ -static void ScalePlaneDown8(int src_width, int src_height, +static void ScalePlaneDown8(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr, FilterMode filtering) { - assert(src_width % 8 == 0); - assert(src_height % 8 == 0); - void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; #if defined(HAS_SCALEROWDOWN8_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth && - (src_stride % 16 == 0) && (dst_stride % 16 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; - } else -#endif - { - ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? - ScaleRowDown8Int_C : ScaleRowDown8_C; } +#endif + for (int y = 0; y < dst_height; ++y) { ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += (src_stride << 3); @@ -2847,72 +2394,75 @@ static void ScalePlaneDown8(int src_width, int src_height, * Provided by Frank Barchard (fbarchard@google.com) * */ -static void ScalePlaneDown34(int src_width, int src_height, +static void ScalePlaneDown34(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr, FilterMode filtering) { assert(dst_width % 3 == 0); - void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (dst_width % 24 == 0) && (src_stride % 16 == 0) && - (dst_stride % 8 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; } - } else + } #endif #if defined(HAS_SCALEROWDOWN34_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (dst_width % 24 == 0) && (src_stride % 16 == 0) && - (dst_stride % 8 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && - filtering) { + if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) { ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; - } else + } #endif - { +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if 
(TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; } } - int src_row = 0; - for (int y = 0; y < dst_height; ++y) { - switch (src_row) { - case 0: - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); - break; - - case 1: - ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); - break; - - case 2: - ScaleRowDown34_0(src_ptr + src_stride, -src_stride, - dst_ptr, dst_width); - break; - } - ++src_row; +#endif + + for (int y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - if (src_row >= 3) { - src_ptr += src_stride; - src_row = 0; - } + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); } } @@ -2922,23 +2472,47 @@ static void ScalePlaneDown34(int src_width, int src_height, * This is an optimized version for scaling down a plane to 3/8 * of its original size. 
* - * Reduces 16x3 to 6x1 + * Uses box filter arranges like this + * aaabbbcc -> abc + * aaabbbcc def + * aaabbbcc ghi + * dddeeeff + * dddeeeff + * dddeeeff + * ggghhhii + * ggghhhii + * Boxes are 3x3, 2x3, 3x2 and 2x2 */ -static void ScalePlaneDown38(int src_width, int src_height, +static void ScalePlaneDown38(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr, FilterMode filtering) { assert(dst_width % 3 == 0); - void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (dst_width % 24 == 0) && (src_stride % 16 == 0) && - (dst_stride % 8 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + } + } +#elif defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; @@ -2946,39 +2520,34 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; } - } else + } #endif - { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; - } + + for (int y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; } - int src_row = 0; - for (int y = 0; y < dst_height; ++y) { - switch (src_row) { - case 0: - case 1: - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - ++src_row; - break; - - case 2: - ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - src_row = 0; - break; - } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); } } -inline static uint32 SumBox(int iboxwidth, int iboxheight, - int src_stride, const uint8* src_ptr) { +static __inline uint32 SumBox(int iboxwidth, int iboxheight, + ptrdiff_t src_stride, const uint8* 
src_ptr) { assert(iboxwidth > 0); assert(iboxheight > 0); uint32 sum = 0u; @@ -2991,10 +2560,9 @@ inline static uint32 SumBox(int iboxwidth, int iboxheight, return sum; } -static void ScalePlaneBoxRow(int dst_width, int boxheight, - int dx, int src_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int x = 0; +static void ScalePlaneBoxRow_C(int dst_width, int boxheight, + int x, int dx, ptrdiff_t src_stride, + const uint8* src_ptr, uint8* dst_ptr) { for (int i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; @@ -3004,7 +2572,7 @@ static void ScalePlaneBoxRow(int dst_width, int boxheight, } } -inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { assert(iboxwidth > 0); uint32 sum = 0u; for (int x = 0; x < iboxwidth; ++x) { @@ -3013,14 +2581,13 @@ inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { return sum; } -static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr) { int scaletbl[2]; int minboxwidth = (dx >> 16); scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); int *scaleptr = scaletbl - minboxwidth; - int x = 0; for (int i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; @@ -3029,11 +2596,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, } } -static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr) { int boxwidth = (dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); - int x = 0; for (int i = 0; i < dst_width; ++i) { *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; x += boxwidth; @@ -3055,61 +2621,56 @@ static void ScalePlaneBox(int src_width, int src_height, const uint8* src_ptr, uint8* dst_ptr) { assert(dst_width > 0); assert(dst_height > 0); - int dy = (src_height << 16) / dst_height; int dx = (src_width << 16) / dst_width; - if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) || + int dy = (src_height << 16) / dst_height; + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int y = (dy >= 65536) ? 
((dy >> 1) - 32768) : (dy >> 1); + int maxy = (src_height << 16); + if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || dst_height * 2 > src_height) { uint8* dst = dst_ptr; - int dy = (src_height << 16) / dst_height; - int dx = (src_width << 16) / dst_width; - int y = 0; for (int j = 0; j < dst_height; ++j) { int iy = y >> 16; - const uint8* const src = src_ptr + iy * src_stride; + const uint8* src = src_ptr + iy * src_stride; y += dy; - if (y > (src_height << 16)) { - y = (src_height << 16); + if (y > maxy) { + y = maxy; } int boxheight = (y >> 16) - iy; - ScalePlaneBoxRow(dst_width, boxheight, - dx, src_stride, - src, dst); - + ScalePlaneBoxRow_C(dst_width, boxheight, + x, dx, src_stride, + src, dst); dst += dst_stride; } } else { - ALIGN16(uint16 row[kMaxInputWidth]); - void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, - uint16* dst_ptr, int src_width, int src_height); - void (*ScaleAddCols)(int dst_width, int boxheight, int dx, + SIMD_ALIGNED(uint16 row[kMaxInputWidth]); + void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height)= + ScaleAddRows_C; + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr); -#if defined(HAS_SCALEADDROWS_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && - (src_width % 16) == 0) { - ScaleAddRows = ScaleAddRows_SSE2; - } else -#endif - { - ScaleAddRows = ScaleAddRows_C; - } if (dx & 0xffff) { ScaleAddCols = ScaleAddCols2_C; } else { ScaleAddCols = ScaleAddCols1_C; } +#if defined(HAS_SCALEADDROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { + ScaleAddRows = ScaleAddRows_SSE2; + } +#endif - int y = 0; for (int j = 0; j < dst_height; ++j) { int iy = y >> 16; - const uint8* const src = src_ptr + iy * src_stride; + const uint8* src = src_ptr + iy * src_stride; y += dy; if (y > (src_height << 16)) { y = (src_height << 16); } int boxheight = (y >> 16) - iy; ScaleAddRows(src, src_stride, row, src_width, boxheight); - ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr); dst_ptr += dst_stride; } } @@ -3122,33 +2683,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - uint8* dst = dst_ptr; int dx = (src_width << 16) / dst_width; int dy = (src_height << 16) / dst_height; - int maxx = ((src_width - 1) << 16) - 1; - int maxy = ((src_height - 1) << 16) - 1; - int y = (dst_height < src_height) ? 32768 : - (src_height << 16) / dst_height - 32768; + int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); + int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0; + int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; for (int i = 0; i < dst_height; ++i) { - int cy = (y < 0) ? 0 : y; - int yi = cy >> 16; - int yf = cy & 0xffff; - const uint8* const src = src_ptr + yi * src_stride; - int x = (dst_width < src_width) ? 32768 : - (src_width << 16) / dst_width - 32768; + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int yi = y >> 16; + int yf = y & 0xffff; + const uint8* src0 = src_ptr + yi * src_stride; + const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0; + uint8* dst = dst_ptr; for (int j = 0; j < dst_width; ++j) { - int cx = (x < 0) ? 
0 : x; - int xi = cx >> 16; - int xf = cx & 0xffff; - int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; - int r1 = (src[xi + src_stride] * (65536 - xf) + - src[xi + src_stride + 1] * xf) >> 16; - *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + int xi = x >> 16; + int xf = x & 0xffff; + int x1 = (xi < src_width - 1) ? xi + 1 : xi; + int a = src0[xi]; + int b = src0[x1]; + int r0 = BLENDER(a, b, xf); + a = src1[xi]; + b = src1[x1]; + int r1 = BLENDER(a, b, xf); + *dst++ = BLENDER(r0, r1, yf); x += dx; if (x > maxx) x = maxx; } - dst += dst_stride - dst_width; + dst_ptr += dst_stride; y += dy; if (y > maxy) y = maxy; @@ -3159,52 +2721,51 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height, * Scale plane to/from any dimensions, with bilinear * interpolation. */ -static void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { assert(dst_width > 0); assert(dst_height > 0); - int dy = (src_height << 16) / dst_height; - int dx = (src_width << 16) / dst_width; - if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) { + if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src_ptr, dst_ptr); } else { - ALIGN16(uint8 row[kMaxInputWidth + 1]); + SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]); void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, - int src_stride, - int dst_width, int source_y_fraction); - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int dx); -#if defined(HAS_SCALEFILTERROWS_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && - (src_width % 16) == 0) { - ScaleFilterRows = ScaleFilterRows_SSSE3; - } else + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) = + ScaleFilterRows_C; +#if defined(HAS_SCALEFILTERROWS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleFilterRows = ScaleFilterRows_NEON; + } #endif #if defined(HAS_SCALEFILTERROWS_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && - (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && - (src_width % 16) == 0) { + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { ScaleFilterRows = ScaleFilterRows_SSE2; - } else + } #endif - { - ScaleFilterRows = ScaleFilterRows_C; +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { + ScaleFilterRows = ScaleFilterRows_SSSE3; } - ScaleFilterCols = ScaleFilterCols_C; +#endif - int y = 0; - int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); + int maxy = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; for (int j = 0; j < dst_height; ++j) { - int iy = y >> 16; - int fy = (y >> 8) & 255; - const uint8* const src = src_ptr + iy * src_stride; - ScaleFilterRows(row, src, src_stride, src_width, fy); - ScaleFilterCols(dst_ptr, row, dst_width, dx); + int yi = y >> 16; + int yf = (y >> 8) & 255; + const uint8* src = src_ptr + yi * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, yf); + ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; if (y > maxy) { @@ -3224,18 +2785,20 @@ static void ScalePlaneSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - uint8* dst = dst_ptr; int dx = (src_width << 16) / dst_width; - for (int y = 0; y < dst_height; ++y) { - const uint8* const src = src_ptr + (y * src_height / dst_height) * - src_stride; - // TODO(fbarchard): Round X coordinate by setting x=0x8000. - int x = 0; + int dy = (src_height << 16) / dst_height; + int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); + for (int j = 0; j < dst_height; ++j) { + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + uint8* dst = dst_ptr; for (int i = 0; i < dst_width; ++i) { *dst++ = src[x >> 16]; x += dx; } - dst += dst_stride - dst_width; + dst_ptr += dst_stride; + y += dy; } } @@ -3283,47 +2846,31 @@ static void ScalePlaneDown(int src_width, int src_height, } } -/** - * Copy plane, no scaling - * - * This simply copies the given plane without scaling. - * The current implementation is ~115 times faster - * compared to the reference implementation. - * - */ -static void CopyPlane(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - if (src_stride == src_width && dst_stride == dst_width) { - // All contiguous, so can use REALLY fast path. - memcpy(dst_ptr, src_ptr, src_width * src_height); - } else { - // Not all contiguous; must copy scanlines individually - const uint8* src = src_ptr; - uint8* dst = dst_ptr; - for (int i = 0; i < src_height; ++i) { - memcpy(dst, src, src_width); - dst += dst_stride; - src += src_stride; - } +// Scale a plane. +// This function in turn calls a scaling function suitable for handling +// the desired resolutions. + +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering) { +#ifdef CPU_X86 + // environment variable overrides for testing. + char *filter_override = getenv("LIBYUV_FILTER"); + if (filter_override) { + filtering = (FilterMode)atoi(filter_override); // NOLINT } -} - -static void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - FilterMode filtering, bool use_ref) { +#endif // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { // Straight copy. - CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); } else if (dst_width <= src_width && dst_height <= src_height) { // Scale down. - if (use_ref) { + if (use_reference_impl_) { // For testing, allow the optimized versions to be disabled. 
ScalePlaneDown(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -3342,11 +2889,13 @@ static void ScalePlane(const uint8* src, int src_stride, // optimized, 3/8 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + } else if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + } else if (8 * dst_width == src_width && 8 * dst_height == src_height && + filtering != kFilterBilinear) { // optimized, 1/8 ScalePlaneDown8(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -3362,14 +2911,12 @@ static void ScalePlane(const uint8* src, int src_stride, } } -/** - * Scale a plane. - * - * This function in turn calls a scaling function - * suitable for handling the desired resolutions. - * - */ +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +#define UNDER_ALLOCATED_HACK 1 +LIBYUV_API int I420Scale(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -3394,23 +2941,47 @@ int I420Scale(const uint8* src_y, int src_stride_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - int halfsrc_width = (src_width + 1) >> 1; - int halfsrc_height = (src_height + 1) >> 1; - int halfdst_width = (dst_width + 1) >> 1; - int halfoheight = (dst_height + 1) >> 1; + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + +#ifdef UNDER_ALLOCATED_HACK + // If caller passed width / 2 for stride, adjust halfwidth to match. + if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) { + src_halfwidth = src_width >> 1; + } + if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) { + dst_halfwidth = dst_width >> 1; + } + // If caller used height / 2 when computing src_v, it will point into what + // should be the src_u plane. Detect this and reduce halfheight to match. 
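A worked example may make the check that follows clearer; the sizes here are hypothetical and only illustrate the case the hack is guarding against:
// src_width = 640, src_height = 481 -> src_halfwidth = 320 and src_halfheight
// rounds up to 241, but a caller that computed height / 2 laid out 240-row
// chroma planes: src_u = buf + 640 * 481 and src_v = src_u + 320 * 240.
// That src_v falls inside [src_u, src_u + 320 * 241), so the range test below
// spots the under-allocation and reduces src_halfheight to 240.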
+ int uv_src_plane_size = src_halfwidth * src_halfheight; + if ((src_height & 1) && + (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { + src_halfheight = src_height >> 1; + } + int uv_dst_plane_size = dst_halfwidth * dst_halfheight; + if ((dst_height & 1) && + (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { + dst_halfheight = dst_height >> 1; + } +#endif ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, - filtering, use_reference_impl_); - ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, - dst_u, dst_stride_u, halfdst_width, halfoheight, - filtering, use_reference_impl_); - ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, - dst_v, dst_stride_v, halfdst_width, halfoheight, - filtering, use_reference_impl_); + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); return 0; } +// Deprecated api +LIBYUV_API int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, @@ -3433,49 +3004,77 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - int halfsrc_width = (src_width + 1) >> 1; - int halfsrc_height = (src_height + 1) >> 1; - int halfdst_width = (dst_width + 1) >> 1; - int halfoheight = (dst_height + 1) >> 1; + int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; FilterMode filtering = interpolate ? kFilterBox : kFilterNone; +#ifdef UNDER_ALLOCATED_HACK + // If caller passed width / 2 for stride, adjust halfwidth to match. + if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) { + src_halfwidth = src_width >> 1; + } + if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) { + dst_halfwidth = dst_width >> 1; + } + // If caller used height / 2 when computing src_v, it will point into what + // should be the src_u plane. Detect this and reduce halfheight to match. 
+ int uv_src_plane_size = src_halfwidth * src_halfheight; + if ((src_height & 1) && + (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { + src_halfheight = src_height >> 1; + } + int uv_dst_plane_size = dst_halfwidth * dst_halfheight; + if ((dst_height & 1) && + (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { + dst_halfheight = dst_height >> 1; + } +#endif + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, - filtering, use_reference_impl_); - ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, - dst_u, dst_stride_u, halfdst_width, halfoheight, - filtering, use_reference_impl_); - ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, - dst_v, dst_stride_v, halfdst_width, halfoheight, - filtering, use_reference_impl_); + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); return 0; } -int Scale(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int ooffset, - bool interpolate) { +// Deprecated api +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + bool interpolate) { if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || - ooffset >= dst_height) { + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || + dst_yoffset >= dst_height) { return -1; } - ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. - int halfsrc_width = (src_width + 1) >> 1; - int halfsrc_height = (src_height + 1) >> 1; - int halfdst_width = (dst_width + 1) >> 1; - int halfoheight = (dst_height + 1) >> 1; - int aheight = dst_height - ooffset * 2; // actual output height - const uint8* const iyptr = src; - uint8* oyptr = dst + ooffset * dst_width; - const uint8* const iuptr = src + src_width * src_height; - uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; - const uint8* const ivptr = src + src_width * src_height + - halfsrc_width * halfsrc_height; - uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + - (ooffset >> 1) * halfdst_width; - return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, - src_width, src_height, oyptr, ouptr, ovptr, dst_width, - halfdst_width, halfdst_width, dst_width, aheight, interpolate); -} - + dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. 
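The pointer arithmetic below assumes the packed I420 layout sketched here (the 64x48 numbers are illustrative only):
// Layout sketch (64x48 used only as an example): the packed buffer is
//   Y:  src_width * src_height bytes at src                  (64*48 = 3072)
//   U:  ((src_width + 1) / 2) * ((src_height + 1) / 2) bytes (32*24 = 768)
//   V:  the same size, immediately after U
// dst_yoffset is rounded down to an even row so that the dst_yoffset / 2 rows
// skipped in U and V line up with the Y rows skipped below.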
+ int src_halfwidth = (src_width + 1) >> 1; + int src_halfheight = (src_height + 1) >> 1; + int dst_halfwidth = (dst_width + 1) >> 1; + int dst_halfheight = (dst_height + 1) >> 1; + int aheight = dst_height - dst_yoffset * 2; // actual output height + const uint8* src_y = src; + const uint8* src_u = src + src_width * src_height; + const uint8* src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset >> 1) * dst_halfwidth; + return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, + src_width, src_height, dst_y, dst_u, dst_v, dst_width, + dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); +} + +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc new file mode 100644 index 00000000..5d4e1ac0 --- /dev/null +++ b/files/source/scale_argb.cc @@ -0,0 +1,1035 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> +#include <stdlib.h> // For getenv() + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Bilinear SSE2 is disabled. +#define SSE2_DISABLED 1 + +// ARGB scaling uses bilinear or point, but not box filter. +/** + * SSE2 downscalers with bilinear interpolation. + */ + +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) + +#define HAS_SCALEARGBROWDOWN2_SSE2 +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0x88 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
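Before the SIMD versions, a scalar model of what this row kernel produces may help; the helper below is a sketch for illustration only, and its name is not part of the library:
static void ScaleARGBRowDown2_Ref(const uint8* src_ptr, uint8* dst_ptr,
                                  int dst_width) {
  // Each ARGB pixel is 4 bytes, so the row can be walked as uint32 pixels and
  // every second pixel kept (pixels 0, 2, 4, 6, ...).
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2];
  }
}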
+__declspec(naked) __declspec(align(16)) +static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +// Reads 4 pixels at a time. +// Alignment requirement: dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_ptr + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_ptr + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 16 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_ptr + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_ptr + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 16 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version. 
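The asm is easier to follow against a scalar version of the same blend; this sketch uses an illustrative name and simplifies rounding, applying source_y_fraction (0..255) to every byte of the two ARGB rows:
static void ScaleARGBFilterRows_Ref(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  // frac * (row1 - row0) + row0, i.e. (1 - f) * row0 + f * row1 in fixed point.
  for (int x = 0; x < dst_width * 4; ++x) {  // 4 bytes per ARGB pixel
    dst_ptr[x] = static_cast<uint8>(
        row0[x] + (((row1[x] - row0[x]) * source_y_fraction) >> 8));
  }
}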
+#ifndef SSE2_DISABLED +#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm4, xmm4 + + // f * row1 + (1 - frac) row0 + // frac * (row1 - row0) + row0 + align 16 + xloop: + movdqa xmm0, [esi] // row0 + movdqa xmm2, [esi + edx] // row1 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 // duplicate last pixel for filtering + pop edi + pop esi + ret + + align 16 + xloop1: + movdqa xmm0, [esi] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 + pop edi + pop esi + ret + + align 16 + xloop2: + movdqa xmm0, [esi] + pavgb xmm0, [esi + edx] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 + pop edi + pop esi + ret + } +} +#endif // SSE2_DISABLED + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. 
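The SSSE3 variant reaches the same blend with a single multiply-add; roughly, and ignoring rounding:
// source_y_fraction is first halved to f in 0..128. The two weights (128 - f)
// and f are packed into adjacent bytes, row0 and row1 bytes are interleaved
// with punpcklbw/punpckhbw, and pmaddubsw then yields
//   row0 * (128 - f) + row1 * f
// per byte pair, which psrlw by 7 scales back to the 0..255 range.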
+#define HAS_SCALEARGBFILTERROWS_SSSE3 +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + cmp eax, 0 + je xloop1 + cmp eax, 64 + je xloop2 + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 16 + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 // duplicate last pixel for filtering + pop edi + pop esi + ret + + align 16 + xloop1: + movdqa xmm0, [esi] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop1 + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 + pop edi + pop esi + ret + + align 16 + xloop2: + movdqa xmm0, [esi] + pavgb xmm0, [esi + edx] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop2 + + shufps xmm0, xmm0, 0xff + movdqa [esi + edi], xmm0 + pop edi + pop esi + ret + } +} + +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEARGBROWDOWN2_SSE2 +static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +// Reads 4 pixels at a time. +// Alignment requirement: dst_ptr 16 byte aligned. 
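ScaleARGBRowDownEven copies every src_stepx-th ARGB pixel of a row; the inline assembly below gathers four such pixels per iteration using byte offsets of 0, 4*step, 8*step and 12*step (hence the lea instructions that scale the step by 4 and by 3). A plain scalar equivalent, shown for reference only with an illustrative name:

#include <stdint.h>

static void GatherEveryNthArgb(const uint8_t* src_argb, int src_stepx,
                               uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = reinterpret_cast<const uint32_t*>(src_argb);
  uint32_t* dst = reinterpret_cast<uint32_t*>(dst_argb);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];  // copy one whole 4-byte ARGB pixel per step
  }
}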
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea 0x0(,%1,4),%1 \n" + "lea (%1,%1,2),%4 \n" + ".p2align 4 \n" + "1: \n" + "movd (%0),%%xmm0 \n" + "movd (%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd (%0,%1,2),%%xmm2 \n" + "movd (%0,%4,1),%%xmm3 \n" + "lea (%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_ptr 16 byte aligned. +static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_ptr, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = static_cast<intptr_t>(src_stride); + asm volatile ( + "lea 0x0(,%1,4),%1 \n" + "lea (%1,%1,2),%4 \n" + "lea (%0,%5,1),%5 \n" + ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps (%0,%1,1),%%xmm0 \n" + "movq (%0,%1,2),%%xmm1 \n" + "movhps (%0,%4,1),%%xmm1 \n" + "lea (%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps (%5,%1,1),%%xmm2 \n" + "movq (%5,%1,2),%%xmm3 \n" + "movhps (%5,%4,1),%%xmm3 \n" + "lea (%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_ptr), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +#ifndef SSE2_DISABLED +// Bilinear row filtering combines 4x2 -> 4x1. 
SSE2 version +#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED +void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x80,%3 \n" + "je 3f \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + ".p2align 4 \n" + "4: \n" + "shufps $0xff,%%xmm0,%%xmm0 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // SSE2_DISABLED + +// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version +#define HAS_SCALEARGBFILTERROWS_SSSE3 +void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 2f \n" + "cmp $0x40,%3 \n" + "je 3f \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + "jmp 4f \n" + ".p2align 4 \n" + "2: \n" + "movdqa (%1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 2b \n" + "jmp 4f \n" + ".p2align 4 \n" + "3: \n" + "movdqa (%1),%%xmm0 \n" + "pavgb (%1,%4,1),%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 3b \n" + "4: \n" + ".p2align 4 \n" + "shufps $0xff,%%xmm0,%%xmm0 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} +#endif // defined(__x86_64__) || defined(__i386__) + +static void ScaleARGBRowDown2_C(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + const uint32* src = reinterpret_cast<const 
uint32*>(src_ptr); + uint32* dst = reinterpret_cast<uint32*>(dst_ptr); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[2]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_ptr[0] = (src_ptr[0] + src_ptr[4] + + src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2; + dst_ptr[1] = (src_ptr[1] + src_ptr[5] + + src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2; + dst_ptr[2] = (src_ptr[2] + src_ptr[6] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2; + dst_ptr[3] = (src_ptr[3] + src_ptr[7] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2; + src_ptr += 8; + dst_ptr += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_ptr, int dst_width) { + const uint32* src = reinterpret_cast<const uint32*>(src_ptr); + uint32* dst = reinterpret_cast<uint32*>(dst_ptr); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_ptr[0] = (src_ptr[0] + src_ptr[4] + + src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2; + dst_ptr[1] = (src_ptr[1] + src_ptr[5] + + src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2; + dst_ptr[2] = (src_ptr[2] + src_ptr[6] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2; + dst_ptr[3] = (src_ptr[3] + src_ptr[7] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2; + src_ptr += src_stepx * 4; + dst_ptr += 4; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) + +#define BLENDER1(a, b, f) (static_cast<int>(a) + \ + ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) + +#define BLENDERC(a, b, f, s) static_cast<uint32>( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) + +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_ptr); + uint32* dst = reinterpret_cast<uint32*>(dst_ptr); + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, x & 0xffff); + } +} + +static const int kMaxInputWidth = 2560; + +// C version 2x2 -> 2x1 +void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + assert(dst_width > 0); + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + uint8* end = dst_ptr + (dst_width << 2); + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * 
y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); + // Duplicate the last pixel (4 bytes) for filtering. + dst_ptr[0] = dst_ptr[-4]; + dst_ptr[1] = dst_ptr[-3]; + dst_ptr[2] = dst_ptr[-2]; + dst_ptr[3] = dst_ptr[-1]; +} + +/** + * ScaleARGB ARGB, 1/2 + * + * This is an optimized version for scaling down a ARGB to 1/2 of + * its original size. + * + */ +static void ScaleARGBDown2(int /* src_width */, int /* src_height */, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C; +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 : + ScaleARGBRowDown2_SSE2; + } +#endif + + // TODO(fbarchard): Loop through source height to allow odd height. + for (int y = 0; y < dst_height; ++y) { + ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } +} + +/** + * ScaleARGB ARGB Even + * + * This is an optimized version for scaling down a ARGB to even + * multiple of its original size. + * + */ +static void ScaleARGBDownEven(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride, + int src_step, uint8* dst_ptr, int dst_width) = + filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 : + ScaleARGBRowDownEven_SSE2; + } +#endif + int src_step = src_width / dst_width; + // Adjust to point to center of box. + int row_step = src_height / dst_height; + int row_stride = row_step * src_stride; + src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4; + for (int y = 0; y < dst_height; ++y) { + ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} +/** + * ScaleARGB ARGB to/from any dimensions, with bilinear + * interpolation. 
+ */ + +static void ScaleARGBBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + assert(dst_width > 0); + assert(dst_height > 0); + assert(src_width <= kMaxInputWidth); + SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]); + void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) = + ScaleARGBFilterRows_C; +#if defined(HAS_SCALEARGBFILTERROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { + ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2; + } +#endif +#if defined(HAS_SCALEARGBFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { + ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3; + } +#endif + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); + int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + for (int j = 0; j < dst_height; ++j) { + int yi = y >> 16; + int yf = (y >> 8) & 255; + const uint8* src = src_ptr + yi * src_stride; + ScaleARGBFilterRows(row, src, src_stride, src_width, yf); + ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } +} + +// Scales a single row of pixels using point sampling. +// Code is adapted from libyuv bilinear yuv scaling, but with bilinear +// interpolation off, and argb pixels instead of yuv. +static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_ptr); + uint32* dst = reinterpret_cast<uint32*>(dst_ptr); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +/** + * ScaleARGB ARGB to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ + +static void ScaleARGBSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); + int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); + for (int i = 0; i < dst_height; ++i) { + ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +/** + * ScaleARGB ARGB to/from any dimensions. + */ +static void ScaleARGBAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering || (src_width > kMaxInputWidth)) { + ScaleARGBSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScaleARGBBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +// ScaleARGB a ARGB. +// +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. 
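Both the point-sampling and bilinear paths above walk the source in 16.16 fixed point: dx is the per-destination-pixel step ((src_width << 16) / dst_width), x >> 16 is the integer source index, and x & 0xffff is the fraction fed to the BLENDER macros. A condensed sketch of the point-sampled case, with illustrative names only:

#include <stdint.h>

// Point-sample one ARGB row from src_width down (or up) to dst_width pixels.
static void PointSampleArgbRow(const uint32_t* src, uint32_t* dst,
                               int src_width, int dst_width) {
  int dx = (src_width << 16) / dst_width;                   // 16.16 step
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);  // start at box center
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part selects the source pixel
    x += dx;                // fraction accumulates and carries into the index
  }
}

The bilinear path uses the same stepping but blends neighboring pixels with the accumulated fraction instead of truncating it.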
+ +static void ScaleARGB(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering) { +#ifdef CPU_X86 + // environment variable overrides for testing. + char *filter_override = getenv("LIBYUV_FILTER"); + if (filter_override) { + filtering = (FilterMode)atoi(filter_override); // NOLINT + } +#endif + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // Optimized 1/2. + ScaleARGBDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + int scale_down_x = src_width / dst_width; + int scale_down_y = src_height / dst_height; + if (dst_width * scale_down_x == src_width && + dst_height * scale_down_y == src_height) { + if (!(scale_down_x & 1) && !(scale_down_y & 1)) { + // Optimized even scale down. ie 4, 6, 8, 10x + ScaleARGBDownEven(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if ((scale_down_x & 1) && (scale_down_y & 1)) { + filtering = kFilterNone; + } + } + // Arbitrary scale up and/or down. + ScaleARGBAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); +} + +// ScaleARGB an ARGB image. +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_argb || src_width <= 0 || src_height == 0 || + !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src_argb = src_argb + (src_height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, + filtering); + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc new file mode 100644 index 00000000..a1946f05 --- /dev/null +++ b/files/source/scale_neon.cc @@ -0,0 +1,534 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.u8 {q0,q1}, [%0]! \n" + "vst1.u8 {q0}, [%1]! 
\n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + "subs %2, #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + "subs %2, #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! 
\n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2), // %5 + "r"(&kMult38_Div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 2f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 3f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + "1: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! \n" + "subs %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 4f \n" + + "2: \n" + "vld1.u8 {q0}, [%1]! \n" + "subs %3, #16 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 2b \n" + "b 4f \n" + + "3: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! \n" + "subs %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.u8 {q0}, [%0]! \n" + "bgt 3b \n" + "4: \n" + "vst1.u8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/files/source/video_common.cc b/files/source/video_common.cc index 8b8ee622..616affd1 100644 --- a/files/source/video_common.cc +++ b/files/source/video_common.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,13 +9,14 @@ */ -#include "video_common.h" - -#include <sstream> +#include "libyuv/video_common.h" +#ifdef __cplusplus namespace libyuv { +extern "C" { +#endif -#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0])))) +#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0])))) struct FourCCAliasEntry { uint32 alias; @@ -24,7 +25,8 @@ struct FourCCAliasEntry { static const FourCCAliasEntry kFourCCAliases[] = { {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, {FOURCC_YUYV, FOURCC_YUY2}, {FOURCC_YUVS, FOURCC_YUY2}, {FOURCC_HDYC, FOURCC_UYVY}, @@ -35,6 +37,7 @@ static const FourCCAliasEntry kFourCCAliases[] = { {FOURCC_BGR3, FOURCC_24BG}, }; +LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc) { for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { if (kFourCCAliases[i].alias == fourcc) { @@ -45,4 +48,8 @@ uint32 CanonicalFourCC(uint32 fourcc) { return fourcc; } +#ifdef __cplusplus +} // extern "C" } // namespace libyuv +#endif + diff --git a/files/source/video_common.h b/files/source/video_common.h deleted file mode 100644 index 9fe08a03..00000000 --- a/files/source/video_common.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -/* -* Common definitions for video, including fourcc and VideoFormat -*/ - - -#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_ -#define LIBYUV_SOURCE_VIDEO_COMMON_H_ - -#include <string> - -#include "libyuv/basic_types.h" - -namespace libyuv { - -////////////////////////////////////////////////////////////////////////////// -// Definition of fourcc. -////////////////////////////////////////////////////////////////////////////// -// Convert four characters to a fourcc code. -// Needs to be a macro otherwise the OS X compiler complains when the kFormat* -// constants are used in a switch. -#define FOURCC(a, b, c, d) (\ - (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ - (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) - -// Some good pages discussing FourCC codes: -// http://developer.apple.com/quicktime/icefloe/dispatch020.html -// http://www.fourcc.org/yuv.php -enum FourCC { - // Canonical fourcc codes used in our code. - FOURCC_I420 = FOURCC('I', '4', '2', '0'), - FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), - FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), - FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), - FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), - FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), - FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), - FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), - FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), - // Next four are Bayer RGB formats. The four characters define the order of - // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom. 
- FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - - // Aliases for canonical fourcc codes, replaced with their canonical - // equivalents by CanonicalFourCC(). - FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420 - FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420 - FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2 - FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac - FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY - FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY - FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG - FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR - FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW - FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG - - // Match any fourcc. - FOURCC_ANY = 0xFFFFFFFF, -}; - -// Converts fourcc aliases into canonical ones. -uint32 CanonicalFourCC(uint32 fourcc); - -} // namespace libyuv - -#endif // LIBYUV_SOURCE_VIDEO_COMMON_H_ diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc new file mode 100644 index 00000000..8a49a612 --- /dev/null +++ b/files/unit_test/compare_test.cc @@ -0,0 +1,450 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "../unit_test/unit_test.h" +#include "libyuv/basic_types.h" +#include "libyuv/compare.h" +#include "libyuv/cpu_id.h" + +namespace libyuv { + +// hash seed of 5381 recommended. 
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) { + uint32 hash = seed; + if (count > 0) { + do { + hash = hash * 33 + *src++; + } while (--count); + } + return hash; +} + +TEST_F(libyuvTest, TestDjb2) { + const int kMaxTest = 2049; + align_buffer_16(src_a, kMaxTest) + + for (int i = 0; i < kMaxTest; ++i) { + src_a[i] = i; + } + for (int i = 0; i < kMaxTest; ++i) { + uint32 h1 = HashDjb2(src_a, kMaxTest, 5381); + uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); + EXPECT_EQ(h1, h2); + } + // Hash constant generator using for tables in compare + int h = 1; + for (int i = 0; i <= 16 ; ++i) { + printf("%08x ", h); + h *= 33; + } + printf("\n"); + + free_aligned_buffer_16(src_a) +} + +TEST_F(libyuvTest, BenchmakDjb2_C) { + const int kMaxTest = 1280 * 720; + align_buffer_16(src_a, kMaxTest) + + for (int i = 0; i < kMaxTest; ++i) { + src_a[i] = i; + } + uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); + uint32 h1; + MaskCpuFlags(kCpuInitialized); + for (int i = 0; i < benchmark_iterations_; ++i) { + h1 = HashDjb2(src_a, kMaxTest, 5381); + } + MaskCpuFlags(-1); + EXPECT_EQ(h1, h2); + free_aligned_buffer_16(src_a) +} + +TEST_F(libyuvTest, BenchmakDjb2_OPT) { + const int kMaxTest = 1280 * 720; + align_buffer_16(src_a, kMaxTest) + + for (int i = 0; i < kMaxTest; ++i) { + src_a[i] = i; + } + uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); + uint32 h1; + for (int i = 0; i < benchmark_iterations_; ++i) { + h1 = HashDjb2(src_a, kMaxTest, 5381); + } + EXPECT_EQ(h1, h2); + free_aligned_buffer_16(src_a) +} + +TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) { + const int kMaxTest = 1280 * 720; + align_buffer_16(src_a, kMaxTest + 1) + + for (int i = 0; i < kMaxTest; ++i) { + src_a[i + 1] = i; + } + uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); + uint32 h1; + for (int i = 0; i < benchmark_iterations_; ++i) { + h1 = HashDjb2(src_a + 1, kMaxTest, 5381); + } + EXPECT_EQ(h1, h2); + free_aligned_buffer_16(src_a) +} + +TEST_F(libyuvTest, BenchmarkSumSquareError_C) { + const int kMaxWidth = 4096 * 3; + align_buffer_16(src_a, kMaxWidth) + align_buffer_16(src_b, kMaxWidth) + + for (int i = 0; i < kMaxWidth; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + MaskCpuFlags(kCpuInitialized); + for (int i = 0; i < benchmark_iterations_; ++i) { + ComputeSumSquareError(src_a, src_b, kMaxWidth); + } + + MaskCpuFlags(-1); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) { + const int kMaxWidth = 4096 * 3; + align_buffer_16(src_a, kMaxWidth) + align_buffer_16(src_b, kMaxWidth) + + for (int i = 0; i < kMaxWidth; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + for (int i = 0; i < benchmark_iterations_; ++i) { + ComputeSumSquareError(src_a, src_b, kMaxWidth); + } + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, SumSquareError) { + const int kMaxWidth = 4096 * 3; + align_buffer_16(src_a, kMaxWidth) + align_buffer_16(src_b, kMaxWidth) + + memset(src_a, 0, kMaxWidth); + memset(src_b, 0, kMaxWidth); + + uint64 err; + err = ComputeSumSquareError(src_a, src_b, kMaxWidth); + + EXPECT_EQ(err, 0); + + memset(src_a, 1, kMaxWidth); + err = ComputeSumSquareError(src_a, src_b, kMaxWidth); + + EXPECT_EQ(err, kMaxWidth); + + memset(src_a, 190, kMaxWidth); + memset(src_b, 193, kMaxWidth); + err = ComputeSumSquareError(src_a, src_b, kMaxWidth); + + EXPECT_EQ(err, (kMaxWidth * 3 * 3)); + + srandom(time(NULL)); + + for (int i = 0; i < kMaxWidth; 
++i) { + src_a[i] = (random() & 0xff); + src_b[i] = (random() & 0xff); + } + + MaskCpuFlags(kCpuInitialized); + uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth); + + MaskCpuFlags(-1); + uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth); + + EXPECT_EQ(c_err, opt_err); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, BenchmarkPsnr_C) { + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + MaskCpuFlags(kCpuInitialized); + + double c_time = get_time(); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFramePsnr(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); + + c_time = (get_time() - c_time) / benchmark_iterations_; + printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6); + + MaskCpuFlags(-1); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, BenchmarkPsnr_OPT) { + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + MaskCpuFlags(-1); + + double opt_time = get_time(); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFramePsnr(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); + + opt_time = (get_time() - opt_time) / benchmark_iterations_; + printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, Psnr) { + const int kSrcWidth = 1280; + const int kSrcHeight = 720; + const int b = 128; + const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2); + const int kSrcStride = 2 * b + kSrcWidth; + align_buffer_16(src_a, kSrcPlaneSize) + align_buffer_16(src_b, kSrcPlaneSize) + + memset(src_a, 0, kSrcPlaneSize); + memset(src_b, 0, kSrcPlaneSize); + + double err; + err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_EQ(err, kMaxPsnr); + + memset(src_a, 255, kSrcPlaneSize); + + err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_EQ(err, 0.0); + + memset(src_a, 1, kSrcPlaneSize); + + err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_GT(err, 48.0); + EXPECT_LT(err, 49.0); + + for (int i = 0; i < kSrcPlaneSize; ++i) + src_a[i] = i; + + err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_GT(err, 4.0); + EXPECT_LT(err, 5.0); + + srandom(time(NULL)); + + memset(src_a, 0, kSrcPlaneSize); + memset(src_b, 0, kSrcPlaneSize); + + for (int i = b; i < (kSrcHeight + b); ++i) { + for (int j = b; j < (kSrcWidth + b); ++j) { + src_a[(i * kSrcStride) + j] = (random() & 0xff); + src_b[(i * kSrcStride) + j] = (random() & 0xff); + } + } + + MaskCpuFlags(kCpuInitialized); + double c_err, opt_err; + + c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + MaskCpuFlags(-1); + + opt_err = 
CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_EQ(opt_err, c_err); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, BenchmarkSsim_C) { + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + MaskCpuFlags(kCpuInitialized); + + double c_time = get_time(); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFrameSsim(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); + + c_time = (get_time() - c_time) / benchmark_iterations_; + printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6); + + MaskCpuFlags(-1); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, BenchmarkSsim_OPT) { + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) + + for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { + src_a[i] = i; + src_b[i] = i; + } + + MaskCpuFlags(-1); + + double opt_time = get_time(); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFrameSsim(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); + + opt_time = (get_time() - opt_time) / benchmark_iterations_; + printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); + + EXPECT_EQ(0, 0); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +TEST_F(libyuvTest, Ssim) { + const int kSrcWidth = 1280; + const int kSrcHeight = 720; + const int b = 128; + const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2); + const int kSrcStride = 2 * b + kSrcWidth; + align_buffer_16(src_a, kSrcPlaneSize) + align_buffer_16(src_b, kSrcPlaneSize) + + memset(src_a, 0, kSrcPlaneSize); + memset(src_b, 0, kSrcPlaneSize); + + double err; + err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_EQ(err, 1.0); + + memset(src_a, 255, kSrcPlaneSize); + + err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_LT(err, 0.0001); + + memset(src_a, 1, kSrcPlaneSize); + + err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_GT(err, 0.8); + EXPECT_LT(err, 0.9); + + for (int i = 0; i < kSrcPlaneSize; ++i) + src_a[i] = i; + + err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_GT(err, 0.008); + EXPECT_LT(err, 0.009); + + srandom(time(NULL)); + for (int i = b; i < (kSrcHeight + b); ++i) { + for (int j = b; j < (kSrcWidth + b); ++j) { + src_a[(i * kSrcStride) + j] = (random() & 0xff); + src_b[(i * kSrcStride) + j] = (random() & 0xff); + } + } + + MaskCpuFlags(kCpuInitialized); + double c_err, opt_err; + + c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + MaskCpuFlags(-1); + + opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, + src_b + kSrcStride * b + b, kSrcStride, + kSrcWidth, kSrcHeight); + + EXPECT_EQ(opt_err, c_err); + + free_aligned_buffer_16(src_a) + free_aligned_buffer_16(src_b) +} + +} // 
namespace libyuv diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc new file mode 100644 index 00000000..52810e80 --- /dev/null +++ b/files/unit_test/cpu_test.cc @@ -0,0 +1,100 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/version.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +TEST_F(libyuvTest, TestCpuHas) { + int cpu_flags = TestCpuFlag(~kCpuInitialized); + printf("Cpu Flags %x\n", cpu_flags); + int has_arm = TestCpuFlag(kCpuHasARM); + printf("Has ARM %x\n", has_arm); + int has_neon = TestCpuFlag(kCpuHasNEON); + printf("Has NEON %x\n", has_neon); + int has_x86 = TestCpuFlag(kCpuHasX86); + printf("Has X86 %x\n", has_x86); + int has_sse2 = TestCpuFlag(kCpuHasSSE2); + printf("Has SSE2 %x\n", has_sse2); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + printf("Has SSSE3 %x\n", has_ssse3); + int has_sse41 = TestCpuFlag(kCpuHasSSE41); + printf("Has SSE4.1 %x\n", has_sse41); + int has_sse42 = TestCpuFlag(kCpuHasSSE42); + printf("Has SSE4.2 %x\n", has_sse42); + int has_avx = TestCpuFlag(kCpuHasAVX); + printf("Has AVX %x\n", has_avx); + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + printf("Has AVX2 %x\n", has_avx2); +} + +#if defined(__i386__) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_X64) +TEST_F(libyuvTest, TestCpuId) { + int has_x86 = TestCpuFlag(kCpuHasX86); + if (has_x86) { + int cpu_info[4]; + // Vendor ID: + // AuthenticAMD AMD processor + // CentaurHauls Centaur processor + // CyrixInstead Cyrix processor + // GenuineIntel Intel processor + // GenuineTMx86 Transmeta processor + // Geode by NSC National Semiconductor processor + // NexGenDriven NexGen processor + // RiseRiseRise Rise Technology processor + // SiS SiS SiS SiS processor + // UMC UMC UMC UMC processor + CpuId(cpu_info, 0); + cpu_info[0] = cpu_info[1]; // Reorder output + cpu_info[1] = cpu_info[3]; + cpu_info[3] = 0; + printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]), + cpu_info[0], cpu_info[1], cpu_info[2]); + EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0]))); + + // CPU Family and Model + // 3:0 - Stepping + // 7:4 - Model + // 11:8 - Family + // 13:12 - Processor Type + // 19:16 - Extended Model + // 27:20 - Extended Family + CpuId(cpu_info, 1); + int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); + int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); + printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, + model, model); + } +} +#endif + +TEST_F(libyuvTest, TestLinuxNeon) { + int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt"); + if (testdata) { + EXPECT_EQ(kCpuInitialized, + ArmCpuCaps("unit_test/testdata/arm_v7.txt")); + EXPECT_EQ((kCpuInitialized | kCpuHasNEON), + ArmCpuCaps("unit_test/testdata/tegra3.txt")); + } else { + printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n"); + } +#if defined(__linux__) && defined(__ARM_NEON__) + EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo")); +#endif +} + +} // namespace libyuv diff --git a/files/unit_test/planar_test.cc 
b/files/unit_test/planar_test.cc new file mode 100644 index 00000000..e9053a35 --- /dev/null +++ b/files/unit_test/planar_test.cc @@ -0,0 +1,1005 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" +#include "libyuv/compare.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "../unit_test/unit_test.h" + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif + +namespace libyuv { + +#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ + const int kWidth = 1280; \ + const int kHeight = 720; \ + const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \ + align_buffer_16(src_y, kWidth * kHeight); \ + align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + align_buffer_16(dst_argb_c, kStride * kHeight); \ + align_buffer_16(dst_argb_opt, kStride * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j] = (random() & 0xff); \ + for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \ + for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \ + src_u[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \ + src_v[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \ + } \ + MaskCpuFlags(kCpuInitialized); \ + FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_u, kWidth / SUBSAMP_X, \ + src_v, kWidth / SUBSAMP_X, \ + dst_argb_c, kStride, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_u, kWidth / SUBSAMP_X, \ + src_v, kWidth / SUBSAMP_X, \ + dst_argb_opt, kStride, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * BPP_B; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \ + static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + free_aligned_buffer_16(src_y) \ + free_aligned_buffer_16(src_u) \ + free_aligned_buffer_16(src_v) \ + free_aligned_buffer_16(dst_argb_c) \ + free_aligned_buffer_16(dst_argb_opt) \ +} + +#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) + +TESTPLANARTOB(I420, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BGRA, 4) +TESTPLANARTOB(I420, 2, 2, ABGR, 4) +TESTPLANARTOB(I420, 2, 2, RGBA, 4) +TESTPLANARTOB(I420, 2, 2, RAW, 3) +TESTPLANARTOB(I420, 2, 2, RGB24, 3) +TESTPLANARTOB(I420, 2, 2, RGB565, 2) +TESTPLANARTOB(I420, 2, 2, ARGB1555, 2) +TESTPLANARTOB(I420, 2, 2, ARGB4444, 2) +TESTPLANARTOB(I422, 2, 1, ARGB, 4) 
+TESTPLANARTOB(I422, 2, 1, BGRA, 4) +TESTPLANARTOB(I422, 2, 1, ABGR, 4) +TESTPLANARTOB(I422, 2, 1, RGBA, 4) +TESTPLANARTOB(I411, 4, 1, ARGB, 4) +TESTPLANARTOB(I444, 1, 1, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, YUY2, 2) +TESTPLANARTOB(I420, 2, 2, UYVY, 2) +// TODO(fbarchard): Re-enable test and fix valgrind. +// TESTPLANARTOB(I420, 2, 2, V210, 16 / 6) +TESTPLANARTOB(I420, 2, 2, I400, 1) +TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1) +TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1) +TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1) +TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1) + +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + N, NEG) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ + const int kWidth = 1280; \ + const int kHeight = 720; \ + align_buffer_16(src_y, kWidth * kHeight); \ + align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \ + align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \ + align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j] = (random() & 0xff); \ + for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \ + for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) { \ + src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \ + } \ + MaskCpuFlags(kCpuInitialized); \ + FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_uv, kWidth / SUBSAMP_X * 2, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_uv, kWidth / SUBSAMP_X * 2, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * BPP_B; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \ + static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + free_aligned_buffer_16(src_y) \ + free_aligned_buffer_16(src_uv) \ + free_aligned_buffer_16(dst_argb_c) \ + free_aligned_buffer_16(dst_argb_opt) \ +} + +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) + +TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4) +TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2) +TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2) + +#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \ + const int kWidth = 1280; \ + const int kHeight = 720; \ + const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ + align_buffer_16(src_argb, kStride * kHeight); \ + align_buffer_16(dst_y_c, kWidth * kHeight); \ + align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + align_buffer_16(dst_y_opt, kWidth * kHeight); \ + align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j] = (random() & 0xff); \ + MaskCpuFlags(kCpuInitialized); \ + FMT_A##To##FMT_PLANAR(src_argb, kStride, \ + dst_y_c, 
kWidth, \ + dst_u_c, kWidth / SUBSAMP_X, \ + dst_v_c, kWidth / SUBSAMP_X, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb, kStride, \ + dst_y_opt, kWidth, \ + dst_u_opt, kWidth / SUBSAMP_X, \ + dst_v_opt, kWidth / SUBSAMP_X, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \ + for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) - \ + static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \ + for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) - \ + static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + free_aligned_buffer_16(dst_y_c) \ + free_aligned_buffer_16(dst_u_c) \ + free_aligned_buffer_16(dst_v_c) \ + free_aligned_buffer_16(dst_y_opt) \ + free_aligned_buffer_16(dst_u_opt) \ + free_aligned_buffer_16(dst_v_opt) \ + free_aligned_buffer_16(src_argb) \ +} + +#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \ + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) + +TESTATOPLANAR(ARGB, 4, I420, 2, 2) +TESTATOPLANAR(BGRA, 4, I420, 2, 2) +TESTATOPLANAR(ABGR, 4, I420, 2, 2) +TESTATOPLANAR(RGBA, 4, I420, 2, 2) +TESTATOPLANAR(RAW, 3, I420, 2, 2) +TESTATOPLANAR(RGB24, 3, I420, 2, 2) +TESTATOPLANAR(RGB565, 2, I420, 2, 2) +TESTATOPLANAR(ARGB1555, 2, I420, 2, 2) +TESTATOPLANAR(ARGB4444, 2, I420, 2, 2) +// TESTATOPLANAR(ARGB, 4, I411, 4, 1) +TESTATOPLANAR(ARGB, 4, I422, 2, 1) +// TESTATOPLANAR(ARGB, 4, I444, 1, 1) +// TODO(fbarchard): Implement and test 411 and 444 +TESTATOPLANAR(YUY2, 2, I420, 2, 2) +TESTATOPLANAR(UYVY, 2, I420, 2, 2) +TESTATOPLANAR(YUY2, 2, I422, 2, 1) +TESTATOPLANAR(UYVY, 2, I422, 2, 1) +TESTATOPLANAR(V210, 16 / 6, I420, 2, 2) +TESTATOPLANAR(I400, 1, I420, 2, 2) +TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2) +TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2) +TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2) +TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2) + +#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, N, NEG) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \ + const int kWidth = 1280; \ + const int kHeight = 720; \ + align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \ + align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \ + align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ + MaskCpuFlags(kCpuInitialized); \ + FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, NEG 
kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + free_aligned_buffer_16(src_argb) \ + free_aligned_buffer_16(dst_argb_c) \ + free_aligned_buffer_16(dst_argb_opt) \ +} +#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -) + +TESTATOB(I400, 1, 1, I400, 1) +TESTATOB(ARGB, 4, 4, ARGB, 4) +TESTATOB(ARGB, 4, 4, BGRA, 4) +TESTATOB(ARGB, 4, 4, ABGR, 4) +TESTATOB(ARGB, 4, 4, RGBA, 4) +TESTATOB(ARGB, 4, 4, RAW, 3) +TESTATOB(ARGB, 4, 4, RGB24, 3) +TESTATOB(ARGB, 4, 4, RGB565, 2) +TESTATOB(ARGB, 4, 4, ARGB1555, 2) +TESTATOB(ARGB, 4, 4, ARGB4444, 2) +TESTATOB(BGRA, 4, 4, ARGB, 4) +TESTATOB(ABGR, 4, 4, ARGB, 4) +TESTATOB(RGBA, 4, 4, ARGB, 4) +TESTATOB(RAW, 3, 3, ARGB, 4) +TESTATOB(RGB24, 3, 3, ARGB, 4) +TESTATOB(RGB565, 2, 2, ARGB, 4) +TESTATOB(ARGB1555, 2, 2, ARGB, 4) +TESTATOB(ARGB4444, 2, 2, ARGB, 4) +TESTATOB(YUY2, 2, 2, ARGB, 4) +TESTATOB(UYVY, 2, 2, ARGB, 4) +TESTATOB(M420, 3 / 2, 1, ARGB, 4) + +static const int kReadPad = 16; // Allow overread of 16 bytes. +#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ + srandom(time(NULL)); \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (random() & 63) + 1; \ + const int kHeight = (random() & 31) + 1; \ + align_buffer_page_end(src_argb, (kWidth * BPP_A) * kHeight + kReadPad); \ + align_buffer_page_end(dst_argb_c, (kWidth * BPP_B) * kHeight); \ + align_buffer_page_end(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ + for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ + MaskCpuFlags(kCpuInitialized); \ + FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, kHeight); \ + MaskCpuFlags(-1); \ + FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, 2); \ + free_aligned_buffer_page_end(src_argb) \ + free_aligned_buffer_page_end(dst_argb_c) \ + free_aligned_buffer_page_end(dst_argb_opt) \ + } \ +} + +TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4) +TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4) +TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4) +TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4) +TESTATOBRANDOM(ARGB, 4, 4, RAW, 3) +TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3) +TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2) +TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2) +TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2) + +TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4) +TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4) +TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4) +TESTATOBRANDOM(RAW, 3, 3, ARGB, 4) +TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4) +TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4) +TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4) +TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4) + +TEST_F(libyuvTest, TestAttenuate) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 atten_pixels[256][4]); + SIMD_ALIGNED(uint8 unatten_pixels[256][4]); + SIMD_ALIGNED(uint8 atten2_pixels[256][4]); + + // Test unattenuation clamps + orig_pixels[0][0] = 
200u; + orig_pixels[0][1] = 129u; + orig_pixels[0][2] = 127u; + orig_pixels[0][3] = 128u; + // Test unattenuation transparent and opaque are unaffected + orig_pixels[1][0] = 16u; + orig_pixels[1][1] = 64u; + orig_pixels[1][2] = 192u; + orig_pixels[1][3] = 0u; + orig_pixels[2][0] = 16u; + orig_pixels[2][1] = 64u; + orig_pixels[2][2] = 192u; + orig_pixels[2][3] = 255u; + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 128u; + ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1); + EXPECT_EQ(255u, unatten_pixels[0][0]); + EXPECT_EQ(255u, unatten_pixels[0][1]); + EXPECT_EQ(254u, unatten_pixels[0][2]); + EXPECT_EQ(128u, unatten_pixels[0][3]); + EXPECT_EQ(16u, unatten_pixels[1][0]); + EXPECT_EQ(64u, unatten_pixels[1][1]); + EXPECT_EQ(192u, unatten_pixels[1][2]); + EXPECT_EQ(0u, unatten_pixels[1][3]); + EXPECT_EQ(16u, unatten_pixels[2][0]); + EXPECT_EQ(64u, unatten_pixels[2][1]); + EXPECT_EQ(192u, unatten_pixels[2][2]); + EXPECT_EQ(255u, unatten_pixels[2][3]); + EXPECT_EQ(32u, unatten_pixels[3][0]); + EXPECT_EQ(128u, unatten_pixels[3][1]); + EXPECT_EQ(255u, unatten_pixels[3][2]); + EXPECT_EQ(128u, unatten_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1); + ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1); + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1); + } + for (int i = 0; i < 256; ++i) { + EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2); + EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2); + EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2); + EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2); + } + // Make sure transparent, 50% and opaque are fully accurate. 
+ EXPECT_EQ(0, atten_pixels[0][0]); + EXPECT_EQ(0, atten_pixels[0][1]); + EXPECT_EQ(0, atten_pixels[0][2]); + EXPECT_EQ(0, atten_pixels[0][3]); + EXPECT_EQ(64, atten_pixels[128][0]); + EXPECT_EQ(32, atten_pixels[128][1]); + EXPECT_EQ(21, atten_pixels[128][2]); + EXPECT_EQ(128, atten_pixels[128][3]); + EXPECT_EQ(255, atten_pixels[255][0]); + EXPECT_EQ(127, atten_pixels[255][1]); + EXPECT_EQ(85, atten_pixels[255][2]); + EXPECT_EQ(255, atten_pixels[255][3]); +} + +TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { + SIMD_ALIGNED(uint8 orig_pixels[16][16][4]); + SIMD_ALIGNED(int32 added_pixels[16][16][4]); + + for (int y = 0; y < 16; ++y) { + for (int x = 0; x < 16; ++x) { + orig_pixels[y][x][0] = 1u; + orig_pixels[y][x][1] = 2u; + orig_pixels[y][x][2] = 3u; + orig_pixels[y][x][3] = 255u; + } + } + + ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4, + &added_pixels[0][0][0], 16 * 4, + 16, 16); + + for (int y = 0; y < 16; ++y) { + for (int x = 0; x < 16; ++x) { + EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]); + EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]); + EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]); + EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]); + } + } +} + +TEST_F(libyuvTest, TestARGBGray) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. + ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1); + EXPECT_EQ(27u, orig_pixels[0][0]); + EXPECT_EQ(27u, orig_pixels[0][1]); + EXPECT_EQ(27u, orig_pixels[0][2]); + EXPECT_EQ(128u, orig_pixels[0][3]); + EXPECT_EQ(151u, orig_pixels[1][0]); + EXPECT_EQ(151u, orig_pixels[1][1]); + EXPECT_EQ(151u, orig_pixels[1][2]); + EXPECT_EQ(0u, orig_pixels[1][3]); + EXPECT_EQ(75u, orig_pixels[2][0]); + EXPECT_EQ(75u, orig_pixels[2][1]); + EXPECT_EQ(75u, orig_pixels[2][2]); + EXPECT_EQ(255u, orig_pixels[2][3]); + EXPECT_EQ(96u, orig_pixels[3][0]); + EXPECT_EQ(96u, orig_pixels[3][1]); + EXPECT_EQ(96u, orig_pixels[3][2]); + EXPECT_EQ(224u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBGrayTo) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 gray_pixels[256][4]); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1); + EXPECT_EQ(27u, gray_pixels[0][0]); + EXPECT_EQ(27u, gray_pixels[0][1]); + EXPECT_EQ(27u, gray_pixels[0][2]); + EXPECT_EQ(128u, gray_pixels[0][3]); + EXPECT_EQ(151u, gray_pixels[1][0]); + EXPECT_EQ(151u, gray_pixels[1][1]); + EXPECT_EQ(151u, gray_pixels[1][2]); + EXPECT_EQ(0u, gray_pixels[1][3]); + EXPECT_EQ(75u, gray_pixels[2][0]); + EXPECT_EQ(75u, gray_pixels[2][1]); + EXPECT_EQ(75u, gray_pixels[2][2]); + EXPECT_EQ(255u, gray_pixels[2][3]); + EXPECT_EQ(96u, gray_pixels[3][0]); + EXPECT_EQ(96u, gray_pixels[3][1]); + EXPECT_EQ(96u, gray_pixels[3][2]); + EXPECT_EQ(224u, gray_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBSepia) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1); + EXPECT_EQ(33u, orig_pixels[0][0]); + EXPECT_EQ(43u, orig_pixels[0][1]); + EXPECT_EQ(47u, orig_pixels[0][2]); + EXPECT_EQ(128u, orig_pixels[0][3]); + EXPECT_EQ(135u, orig_pixels[1][0]); + EXPECT_EQ(175u, orig_pixels[1][1]); + EXPECT_EQ(195u, orig_pixels[1][2]); + EXPECT_EQ(0u, orig_pixels[1][3]); + EXPECT_EQ(69u, orig_pixels[2][0]); + EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(99u, orig_pixels[2][2]); + EXPECT_EQ(255u, orig_pixels[2][3]); + EXPECT_EQ(88u, orig_pixels[3][0]); + EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(127u, orig_pixels[3][2]); + EXPECT_EQ(224u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBColorMatrix) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + // Matrix for Sepia. + static const int8 kARGBToSepia[] = { + 17, 68, 35, 0, + 22, 88, 45, 0, + 24, 98, 50, 0, + }; + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1); + EXPECT_EQ(33u, orig_pixels[0][0]); + EXPECT_EQ(43u, orig_pixels[0][1]); + EXPECT_EQ(47u, orig_pixels[0][2]); + EXPECT_EQ(128u, orig_pixels[0][3]); + EXPECT_EQ(135u, orig_pixels[1][0]); + EXPECT_EQ(175u, orig_pixels[1][1]); + EXPECT_EQ(195u, orig_pixels[1][2]); + EXPECT_EQ(0u, orig_pixels[1][3]); + EXPECT_EQ(69u, orig_pixels[2][0]); + EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(99u, orig_pixels[2][2]); + EXPECT_EQ(255u, orig_pixels[2][3]); + EXPECT_EQ(88u, orig_pixels[3][0]); + EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(127u, orig_pixels[3][2]); + EXPECT_EQ(224u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBColorTable) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Matrix for Sepia. + static const uint8 kARGBTable[256 * 4] = { + 1u, 2u, 3u, 4u, + 5u, 6u, 7u, 8u, + 9u, 10u, 11u, 12u, + 13u, 14u, 15u, 16u, + }; + + orig_pixels[0][0] = 0u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 0u; + orig_pixels[1][0] = 1u; + orig_pixels[1][1] = 1u; + orig_pixels[1][2] = 1u; + orig_pixels[1][3] = 1u; + orig_pixels[2][0] = 2u; + orig_pixels[2][1] = 2u; + orig_pixels[2][2] = 2u; + orig_pixels[2][3] = 2u; + orig_pixels[3][0] = 0u; + orig_pixels[3][1] = 1u; + orig_pixels[3][2] = 2u; + orig_pixels[3][3] = 3u; + // Do 16 to test asm version. + ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1); + EXPECT_EQ(1u, orig_pixels[0][0]); + EXPECT_EQ(2u, orig_pixels[0][1]); + EXPECT_EQ(3u, orig_pixels[0][2]); + EXPECT_EQ(4u, orig_pixels[0][3]); + EXPECT_EQ(5u, orig_pixels[1][0]); + EXPECT_EQ(6u, orig_pixels[1][1]); + EXPECT_EQ(7u, orig_pixels[1][2]); + EXPECT_EQ(8u, orig_pixels[1][3]); + EXPECT_EQ(9u, orig_pixels[2][0]); + EXPECT_EQ(10u, orig_pixels[2][1]); + EXPECT_EQ(11u, orig_pixels[2][2]); + EXPECT_EQ(12u, orig_pixels[2][3]); + EXPECT_EQ(1u, orig_pixels[3][0]); + EXPECT_EQ(6u, orig_pixels[3][1]); + EXPECT_EQ(11u, orig_pixels[3][2]); + EXPECT_EQ(16u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBQuantize) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + ARGBQuantize(&orig_pixels[0][0], 0, + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]); + EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]); + EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]); + EXPECT_EQ(i, orig_pixels[i][3]); + } + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBQuantize(&orig_pixels[0][0], 0, + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestARGBMirror) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 dst_pixels[256][4]); + + for (int i = 0; i < 256; ++i) { + 
orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i / 4; + } + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(i, dst_pixels[255 - i][0]); + EXPECT_EQ(i / 2, dst_pixels[255 - i][1]); + EXPECT_EQ(i / 3, dst_pixels[255 - i][2]); + EXPECT_EQ(i / 4, dst_pixels[255 - i][3]); + } + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + } +} + +TEST_F(libyuvTest, TestShade) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 shade_pixels[256][4]); + + orig_pixels[0][0] = 10u; + orig_pixels[0][1] = 20u; + orig_pixels[0][2] = 40u; + orig_pixels[0][3] = 80u; + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 0u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 255u; + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 0u; + orig_pixels[2][3] = 0u; + orig_pixels[3][0] = 0u; + orig_pixels[3][1] = 0u; + orig_pixels[3][2] = 0u; + orig_pixels[3][3] = 0u; + ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff); + EXPECT_EQ(10u, shade_pixels[0][0]); + EXPECT_EQ(20u, shade_pixels[0][1]); + EXPECT_EQ(40u, shade_pixels[0][2]); + EXPECT_EQ(40u, shade_pixels[0][3]); + EXPECT_EQ(0u, shade_pixels[1][0]); + EXPECT_EQ(0u, shade_pixels[1][1]); + EXPECT_EQ(0u, shade_pixels[1][2]); + EXPECT_EQ(128u, shade_pixels[1][3]); + EXPECT_EQ(0u, shade_pixels[2][0]); + EXPECT_EQ(0u, shade_pixels[2][1]); + EXPECT_EQ(0u, shade_pixels[2][2]); + EXPECT_EQ(0u, shade_pixels[2][3]); + EXPECT_EQ(0u, shade_pixels[3][0]); + EXPECT_EQ(0u, shade_pixels[3][1]); + EXPECT_EQ(0u, shade_pixels[3][2]); + EXPECT_EQ(0u, shade_pixels[3][3]); + + ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080); + EXPECT_EQ(5u, shade_pixels[0][0]); + EXPECT_EQ(10u, shade_pixels[0][1]); + EXPECT_EQ(20u, shade_pixels[0][2]); + EXPECT_EQ(40u, shade_pixels[0][3]); + + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1, + 0x80808080); + } +} + +TEST_F(libyuvTest, TestInterpolate) { + SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); + SIMD_ALIGNED(uint8 orig_pixels_1[256][4]); + SIMD_ALIGNED(uint8 interpolate_pixels[256][4]); + + orig_pixels_0[0][0] = 16u; + orig_pixels_0[0][1] = 32u; + orig_pixels_0[0][2] = 64u; + orig_pixels_0[0][3] = 128u; + orig_pixels_0[1][0] = 0u; + orig_pixels_0[1][1] = 0u; + orig_pixels_0[1][2] = 0u; + orig_pixels_0[1][3] = 255u; + orig_pixels_0[2][0] = 0u; + orig_pixels_0[2][1] = 0u; + orig_pixels_0[2][2] = 0u; + orig_pixels_0[2][3] = 0u; + orig_pixels_0[3][0] = 0u; + orig_pixels_0[3][1] = 0u; + orig_pixels_0[3][2] = 0u; + orig_pixels_0[3][3] = 0u; + + orig_pixels_1[0][0] = 0u; + orig_pixels_1[0][1] = 0u; + orig_pixels_1[0][2] = 0u; + orig_pixels_1[0][3] = 0u; + orig_pixels_1[1][0] = 0u; + orig_pixels_1[1][1] = 0u; + orig_pixels_1[1][2] = 0u; + orig_pixels_1[1][3] = 0u; + orig_pixels_1[2][0] = 0u; + orig_pixels_1[2][1] = 0u; + orig_pixels_1[2][2] = 0u; + orig_pixels_1[2][3] = 0u; + orig_pixels_1[3][0] = 255u; + orig_pixels_1[3][1] = 255u; + orig_pixels_1[3][2] = 255u; + orig_pixels_1[3][3] = 255u; + + ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, + &interpolate_pixels[0][0], 0, 4, 1, 128); + EXPECT_EQ(8u, interpolate_pixels[0][0]); + EXPECT_EQ(16u, interpolate_pixels[0][1]); + EXPECT_EQ(32u, interpolate_pixels[0][2]); + EXPECT_EQ(64u, interpolate_pixels[0][3]); + EXPECT_EQ(0u, 
interpolate_pixels[1][0]); + EXPECT_EQ(0u, interpolate_pixels[1][1]); + EXPECT_EQ(0u, interpolate_pixels[1][2]); + EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128. + EXPECT_EQ(0u, interpolate_pixels[2][0]); + EXPECT_EQ(0u, interpolate_pixels[2][1]); + EXPECT_EQ(0u, interpolate_pixels[2][2]); + EXPECT_EQ(0u, interpolate_pixels[2][3]); + EXPECT_NEAR(128u, interpolate_pixels[3][0], 1); + EXPECT_NEAR(128u, interpolate_pixels[3][1], 1); + EXPECT_NEAR(128u, interpolate_pixels[3][2], 1); + EXPECT_NEAR(128u, interpolate_pixels[3][3], 1); + + ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, + &interpolate_pixels[0][0], 0, 4, 1, 0); + EXPECT_EQ(16u, interpolate_pixels[0][0]); + EXPECT_EQ(32u, interpolate_pixels[0][1]); + EXPECT_EQ(64u, interpolate_pixels[0][2]); + EXPECT_EQ(128u, interpolate_pixels[0][3]); + + ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, + &interpolate_pixels[0][0], 0, 4, 1, 192); + + EXPECT_EQ(4u, interpolate_pixels[0][0]); + EXPECT_EQ(8u, interpolate_pixels[0][1]); + EXPECT_EQ(16u, interpolate_pixels[0][2]); + EXPECT_EQ(32u, interpolate_pixels[0][3]); + + for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) { + ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, + &interpolate_pixels[0][0], 0, 256, 1, 128); + } +} + +TEST_F(libyuvTest, TestAffine) { + SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); + SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]); +#if defined(HAS_ARGBAFFINEROW_SSE2) + SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]); +#endif + + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 4; ++j) { + orig_pixels_0[i][j] = i; + } + } + + float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f }; + + ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], + uv_step, 256); + EXPECT_EQ(0u, interpolate_pixels_C[0][0]); + EXPECT_EQ(96u, interpolate_pixels_C[128][0]); + EXPECT_EQ(191u, interpolate_pixels_C[255][3]); + +#if defined(HAS_ARGBAFFINEROW_SSE2) + ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], + uv_step, 256); + EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4)); +#endif + +#if defined(HAS_ARGBAFFINEROW_SSE2) + int has_sse2 = TestCpuFlag(kCpuHasSSE2); + if (has_sse2) { + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], + uv_step, 256); + } + } else { +#endif + for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) { + ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], + uv_step, 256); + } +#if defined(HAS_ARGBAFFINEROW_SSE2) + } +#endif +} + +TEST_F(libyuvTest, Test565) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 pixels565[256][2]); + + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 4; ++j) { + orig_pixels[i][j] = i; + } + } + ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1); + uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381); + EXPECT_EQ(610919429u, checksum); +} + +} // namespace libyuv diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc new file mode 100644 index 00000000..fe8435e1 --- /dev/null +++ b/files/unit_test/rotate_argb_test.cc @@ -0,0 +1,195 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/rotate_argb.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +static int ARGBTestRotate(int src_width, int src_height, + int dst_width, int dst_height, + libyuv::RotationMode mode, int runs) { + const int b = 128; + int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4; + int src_stride_argb = (b * 2 + src_width) * 4; + + align_buffer_16(src_argb, src_argb_plane_size) + memset(src_argb, 1, src_argb_plane_size); + + int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4; + int dst_stride_argb = (b * 2 + dst_width) * 4; + + srandom(time(NULL)); + + int i, j; + for (i = b; i < (src_height + b); ++i) { + for (j = b; j < (src_width + b) * 4; ++j) { + src_argb[(i * src_stride_argb) + j] = (random() & 0xff); + } + } + + align_buffer_16(dst_argb_c, dst_argb_plane_size) + align_buffer_16(dst_argb_opt, dst_argb_plane_size) + memset(dst_argb_c, 2, dst_argb_plane_size); + memset(dst_argb_opt, 3, dst_argb_plane_size); + + // Warm up both versions for consistent benchmarks. + MaskCpuFlags(0); // Disable all CPU optimization. + ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, + src_width, src_height, mode); + MaskCpuFlags(-1); // Enable all CPU optimization. + ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, + src_width, src_height, mode); + + MaskCpuFlags(0); // Disable all CPU optimization. + double c_time = get_time(); + for (i = 0; i < runs; ++i) { + ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, + src_width, src_height, mode); + } + c_time = (get_time() - c_time) / runs; + + MaskCpuFlags(-1); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < runs; ++i) { + ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, + src_width, src_height, mode); + } + opt_time = (get_time() - opt_time) / runs; + + // Report performance of C vs OPT + printf("filter %d - %8d us C - %8d us OPT\n", + mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6)); + + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference isn't + // over 2. 
+ int max_diff = 0; + for (i = b; i < (dst_height + b); ++i) { + for (j = b * 4; j < (dst_width + b) * 4; ++j) { + int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + dst_argb_opt[(i * dst_stride_argb) + j]); + if (abs_diff > max_diff) + max_diff = abs_diff; + } + } + + free_aligned_buffer_16(dst_argb_c) + free_aligned_buffer_16(dst_argb_opt) + free_aligned_buffer_16(src_argb) + return max_diff; +} + +TEST_F(libyuvTest, ARGBRotate0) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = 1280; + const int dst_height = 720; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate0, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate90) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = 720; + const int dst_height = 1280; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate90, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate180) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = 1280; + const int dst_height = 720; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate180, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate270) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = 720; + const int dst_height = 1280; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate270, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate0_Odd) { + const int src_width = 1277; + const int src_height = 719; + const int dst_width = 1277; + const int dst_height = 719; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate0, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate90_Odd) { + const int src_width = 1277; + const int src_height = 719; + const int dst_width = 719; + const int dst_height = 1277; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate90, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate180_Odd) { + const int src_width = 1277; + const int src_height = 719; + const int dst_width = 1277; + const int dst_height = 719; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate180, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +TEST_F(libyuvTest, ARGBRotate270_Odd) { + const int src_width = 1277; + const int src_height = 719; + const int dst_width = 719; + const int dst_height = 1277; + + int err = ARGBTestRotate(src_width, src_height, + dst_width, dst_height, kRotate270, + benchmark_iterations_); + EXPECT_GE(1, err); +} + +} // namespace libyuv diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc index 1c295b08..788e511e 100644 --- a/files/unit_test/rotate_test.cc +++ b/files/unit_test/rotate_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,21 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/rotate.h" -#include "../source/rotate_priv.h" -#include "unit_test.h" #include <stdlib.h> #include <time.h> -using namespace libyuv; - -void print_array(uint8 *array, int w, int h) { - int i, j; +#include "libyuv/rotate.h" +#include "../unit_test/unit_test.h" - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) - printf("%4d", (signed char)array[(i * w) + j]); +namespace libyuv { +void PrintArray(uint8 *array, int w, int h) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + printf("%4d", (signed char)array[i * w + j]); + } printf("\n"); } } @@ -31,46 +29,45 @@ TEST_F(libyuvTest, Transpose) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_1; - uint8 *output_2; - ow = ih; oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_1, ow * oh) + align_buffer_16(output_2, iw * ih) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } TransposePlane(input, iw, output_1, ow, iw, ih); TransposePlane(output_1, ow, output_2, oh, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_2[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_2[i]) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("transpose 1\n"); - print_array(output_1, ow, oh); + PrintArray(output_1, ow, oh); printf("transpose 2\n"); - print_array(output_2, iw, ih); + PrintArray(output_2, iw, ih); } - free(input); - free(output_1); - free(output_2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_1) + free_aligned_buffer_16(output_2) } + } EXPECT_EQ(0, err); } @@ -79,23 +76,20 @@ TEST_F(libyuvTest, TransposeUV) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_a1, *output_b1; - uint8 *output_a2, *output_b2; ow = ih; oh = iw >> 1; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_a1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_b1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_a2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_b2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_a1, ow * oh) + align_buffer_16(output_b1, ow * oh) + align_buffer_16(output_a2, iw * ih) + align_buffer_16(output_b2, iw * ih) - for (i = 0; i < (iw * ih); i += 2) { + for (i = 0; i < iw * ih; i += 2) { input[i] = i >> 1; input[i + 1] = -(i >> 1); } @@ -105,32 +99,35 @@ TEST_F(libyuvTest, TransposeUV) { TransposePlane(output_a1, ow, output_a2, oh, ow, oh); TransposePlane(output_b1, ow, output_b2, oh, ow, oh); - for (i = 0; i < (iw * ih); i += 2) { - if (input[i] != output_a2[i >> 1]) + for (i = 0; i < iw * ih; i += 2) { + if (input[i] != output_a2[i >> 1]) { err++; - if (input[i + 1] != output_b2[i >> 1]) + } + if (input[i + 1] != output_b2[i >> 1]) { err++; + } } if (err) { printf("input 
%dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("transpose 1\n"); - print_array(output_a1, ow, oh); - print_array(output_b1, ow, oh); + PrintArray(output_a1, ow, oh); + PrintArray(output_b1, ow, oh); printf("transpose 2\n"); - print_array(output_a2, oh, ow); - print_array(output_b2, oh, ow); + PrintArray(output_a2, oh, ow); + PrintArray(output_b2, oh, ow); } - free(input); - free(output_a1); - free(output_b1); - free(output_a2); - free(output_b2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_a1) + free_aligned_buffer_16(output_b1) + free_aligned_buffer_16(output_a2) + free_aligned_buffer_16(output_b2) } + } EXPECT_EQ(0, err); } @@ -139,60 +136,58 @@ TEST_F(libyuvTest, RotatePlane90) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } RotatePlane90(input, iw, output_90, ow, iw, ih); RotatePlane90(output_90, ow, output_180, oh, ow, oh); RotatePlane90(output_180, oh, output_270, ow, oh, ow); RotatePlane90(output_270, ow, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 90\n"); - print_array(output_90, ow, oh); + PrintArray(output_90, ow, oh); printf("output 180\n"); - print_array(output_180, iw, ih); + PrintArray(output_180, iw, ih); printf("output 270\n"); - print_array(output_270, ow, oh); + PrintArray(output_270, ow, oh); printf("output 0\n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } + } EXPECT_EQ(0, err); } @@ -201,29 +196,22 @@ TEST_F(libyuvTest, RotateUV90) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_90_u = 
static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) - for (i = 0; i < (iw * ih); i += 2) { + for (i = 0; i < iw * ih; i += 2) { input[i] = i >> 1; input[i + 1] = -(i >> 1); } @@ -237,43 +225,46 @@ TEST_F(libyuvTest, RotateUV90) { RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh); for (i = 0; i < (ow * oh); ++i) { - if (output_0_u[i] != (uint8)i) + if (output_0_u[i] != (uint8)i) { err++; - if (output_0_v[i] != (uint8)(-i)) + } + if (output_0_v[i] != (uint8)(-i)) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 90_u\n"); - print_array(output_90_u, ow, oh); + PrintArray(output_90_u, ow, oh); printf("output 90_v\n"); - print_array(output_90_v, ow, oh); + PrintArray(output_90_v, ow, oh); printf("output 180_u\n"); - print_array(output_180_u, oh, ow); + PrintArray(output_180_u, oh, ow); printf("output 180_v\n"); - print_array(output_180_v, oh, ow); + PrintArray(output_180_v, oh, ow); printf("output 0_u\n"); - print_array(output_0_u, oh, ow); + PrintArray(output_0_u, oh, ow); printf("output 0_v\n"); - print_array(output_0_v, oh, ow); + PrintArray(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } + } EXPECT_EQ(0, err); } @@ -282,29 +273,22 @@ TEST_F(libyuvTest, RotateUV180) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = iw >> 1; oh = ih; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) - for (i = 0; i < (iw * ih); i += 2) { + for (i = 0; i < iw * ih; i += 2) { input[i] = i >> 1; input[i + 1] = -(i >> 1); } @@ -318,43 +302,46 @@ TEST_F(libyuvTest, RotateUV180) { RotatePlane90(output_90_v, oh, output_0_v, ow, oh, ow); for (i = 
0; i < (ow * oh); ++i) { - if (output_0_u[i] != (uint8)i) + if (output_0_u[i] != (uint8)i) { err++; - if (output_0_v[i] != (uint8)(-i)) + } + if (output_0_v[i] != (uint8)(-i)) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 180_u\n"); - print_array(output_180_u, oh, ow); + PrintArray(output_180_u, oh, ow); printf("output 180_v\n"); - print_array(output_180_v, oh, ow); + PrintArray(output_180_v, oh, ow); printf("output 90_u\n"); - print_array(output_90_u, oh, ow); + PrintArray(output_90_u, oh, ow); printf("output 90_v\n"); - print_array(output_90_v, oh, ow); + PrintArray(output_90_v, oh, ow); printf("output 0_u\n"); - print_array(output_0_u, ow, oh); + PrintArray(output_0_u, ow, oh); printf("output 0_v\n"); - print_array(output_0_v, ow, oh); + PrintArray(output_0_v, ow, oh); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } + } EXPECT_EQ(0, err); } @@ -363,29 +350,22 @@ TEST_F(libyuvTest, RotateUV270) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_270_u; - uint8 *output_270_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_270_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_270_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_270_u, ow * oh) + align_buffer_16(output_270_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) - for (i = 0; i < (iw * ih); i += 2) { + for (i = 0; i < iw * ih; i += 2) { input[i] = i >> 1; input[i + 1] = -(i >> 1); } @@ -400,43 +380,46 @@ TEST_F(libyuvTest, RotateUV270) { RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh); for (i = 0; i < (ow * oh); ++i) { - if (output_0_u[i] != (uint8)i) + if (output_0_u[i] != (uint8)i) { err++; - if (output_0_v[i] != (uint8)(-i)) + } + if (output_0_v[i] != (uint8)(-i)) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 270_u\n"); - print_array(output_270_u, ow, oh); + PrintArray(output_270_u, ow, oh); printf("output 270_v\n"); - print_array(output_270_v, ow, oh); + PrintArray(output_270_v, ow, oh); printf("output 180_u\n"); - print_array(output_180_u, oh, ow); + PrintArray(output_180_u, oh, ow); printf("output 180_v\n"); - print_array(output_180_v, oh, ow); + PrintArray(output_180_v, oh, ow); printf("output 0_u\n"); - print_array(output_0_u, oh, ow); + 
PrintArray(output_0_u, oh, ow); printf("output 0_v\n"); - print_array(output_0_v, oh, ow); + PrintArray(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_270_u); - free(output_270_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_270_u) + free_aligned_buffer_16(output_270_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } + } EXPECT_EQ(0, err); } @@ -445,45 +428,44 @@ TEST_F(libyuvTest, RotatePlane180) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_180; ow = iw; oh = ih; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_180, iw * ih) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } RotatePlane180(input, iw, output_180, ow, iw, ih); RotatePlane180(output_180, ow, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 180\n"); - print_array(output_180, iw, ih); + PrintArray(output_180, iw, ih); printf("output 0\n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - free(output_180); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_180) } EXPECT_EQ(0, err); @@ -493,25 +475,20 @@ TEST_F(libyuvTest, RotatePlane270) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) { + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) input[i] = i; RotatePlane270(input, iw, output_270, ow, iw, ih); @@ -519,34 +496,36 @@ TEST_F(libyuvTest, RotatePlane270) { RotatePlane270(output_180, oh, output_90, ow, oh, ow); RotatePlane270(output_90, ow, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("input %dx%d \n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output 270\n"); - 
print_array(output_270, ow, oh); + PrintArray(output_270, ow, oh); printf("output 180\n"); - print_array(output_180, iw, ih); + PrintArray(output_180, iw, ih); printf("output 90\n"); - print_array(output_90, ow, oh); + PrintArray(output_90, ow, oh); printf("output 0\n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } + } EXPECT_EQ(0, err); } @@ -555,44 +534,44 @@ TEST_F(libyuvTest, RotatePlane90and270) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + ow = ih; oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } RotatePlane90(input, iw, output_90, ow, iw, ih); RotatePlane270(output_90, ow, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("intput %dx%d\n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output \n"); - print_array(output_90, ow, oh); + PrintArray(output_90, ow, oh); printf("output \n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -602,21 +581,20 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { int iw, ih; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + int ow = ih; int oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } RotatePlane90(input, iw, output_90 + (ow >> 1), ow, @@ -633,25 +611,26 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { RotatePlane270(output_90, ih, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("intput %dx%d\n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output \n"); - print_array(output_90, ow, oh); + PrintArray(output_90, ow, oh); printf("output \n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - 
free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -661,22 +640,20 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) { + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_270, ow * oh) - for (i = 0; i < (iw * ih); ++i) + for (i = 0; i < iw * ih; ++i) { input[i] = i; + } RotatePlane270(input, iw, output_270 + ow * (oh >> 1), ow, @@ -693,36 +670,34 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { RotatePlane90(output_270, ih, output_0, iw, ow, oh); - for (i = 0; i < (iw * ih); ++i) { - if (input[i] != output_0[i]) + for (i = 0; i < iw * ih; ++i) { + if (input[i] != output_0[i]) { err++; + } } if (err) { printf("intput %dx%d\n", iw, ih); - print_array(input, iw, ih); + PrintArray(input, iw, ih); printf("output \n"); - print_array(output_270, ow, oh); + PrintArray(output_270, ow, oh); printf("output \n"); - print_array(output_0, iw, ih); + PrintArray(output_0, iw, ih); } - free(input); - free(output_0); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_270) } + } EXPECT_EQ(0, err); } TEST_F(libyuvTest, I420Rotate90) { int err = 0; - uint8 *orig_y, *orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -732,50 +707,59 @@ TEST_F(libyuvTest, I420Rotate90) { int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, 
uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_u, 0, uv_plane_size); + memset(orig_v, 0, uv_plane_size); + memset(ro0_y, 0, y_plane_size); + memset(ro0_u, 0, uv_plane_size); + memset(ro0_v, 0, uv_plane_size); + memset(ro90_y, 0, y_plane_size); + memset(ro90_u, 0, uv_plane_size); + memset(ro90_v, 0, uv_plane_size); + memset(ro270_y, 0, y_plane_size); + memset(ro270_u, 0, uv_plane_size); + memset(ro270_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { for (j = b; j < (uvw + b); ++j) { - orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff; - orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff; + orig_u[i * (uvw + b * 2) + j] = random() & 0xff; + orig_v[i * (uvw + b * 2) + j] = random() & 0xff; } } - int y_off_0 = b * (yw + (2 * b)) + b; - int uv_off_0 = b * (uvw + (2 * b)) + b; - int y_off_90 = b * (yh + (2 * b)) + b; - int uv_off_90 = b * (uvh + (2 * b)) + b; + int y_off_0 = b * (yw + b * 2) + b; + int uv_off_0 = b * (uvw + b * 2) + b; + int y_off_90 = b * (yh + b * 2) + b; + int uv_off_90 = b * (uvh + b * 2) + b; - int y_st_0 = yw + (2 * b); - int uv_st_0 = uvw + (2 * b); - int y_st_90 = yh + (2 * b); - int uv_st_90 = uvh + (2 * b); + int y_st_0 = yw + b * 2; + int uv_st_0 = uvw + b * 2; + int y_st_90 = yh + b * 2; + int uv_st_90 = uvh + b * 2; I420Rotate(orig_y+y_off_0, y_st_0, orig_u+uv_off_0, uv_st_0, @@ -805,39 +789,38 @@ TEST_F(libyuvTest, I420Rotate90) { kRotateClockwise); for (i = 0; i < y_plane_size; ++i) { - if (orig_y[i] != ro0_y[i]) + if (orig_y[i] != ro0_y[i]) { ++err; + } } for (i = 0; i < uv_plane_size; ++i) { - if (orig_u[i] != ro0_u[i]) + if (orig_u[i] != ro0_u[i]) { ++err; - if (orig_v[i] != ro0_v[i]) + } + if (orig_v[i] != ro0_v[i]) { ++err; + } } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, I420Rotate270) { int err = 0; - uint8 *orig_y, *orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -847,50 +830,59 @@ TEST_F(libyuvTest, I420Rotate270) { int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro90_u = 
static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_u, 0, uv_plane_size); + memset(orig_v, 0, uv_plane_size); + memset(ro0_y, 0, y_plane_size); + memset(ro0_u, 0, uv_plane_size); + memset(ro0_v, 0, uv_plane_size); + memset(ro90_y, 0, y_plane_size); + memset(ro90_u, 0, uv_plane_size); + memset(ro90_v, 0, uv_plane_size); + memset(ro270_y, 0, y_plane_size); + memset(ro270_u, 0, uv_plane_size); + memset(ro270_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { for (j = b; j < (uvw + b); ++j) { - orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff; - orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff; + orig_u[i * (uvw + b * 2) + j] = random() & 0xff; + orig_v[i * (uvw + b * 2) + j] = random() & 0xff; } } - int y_off_0 = b * (yw + (2 * b)) + b; - int uv_off_0 = b * (uvw + (2 * b)) + b; - int y_off_90 = b * (yh + (2 * b)) + b; - int uv_off_90 = b * (uvh + (2 * b)) + b; + int y_off_0 = b * (yw + b * 2) + b; + int uv_off_0 = b * (uvw + b * 2) + b; + int y_off_90 = b * (yh + b * 2) + b; + int uv_off_90 = b * (uvh + b * 2) + b; - int y_st_0 = yw + (2 * b); - int uv_st_0 = uvw + (2 * b); - int y_st_90 = yh + (2 * b); - int uv_st_90 = uvh + (2 * b); + int y_st_0 = yw + b * 2; + int uv_st_0 = uvw + b * 2; + int y_st_90 = yh + b * 2; + int uv_st_90 = uvh + b * 2; I420Rotate(orig_y+y_off_0, y_st_0, orig_u+uv_off_0, uv_st_0, @@ -920,38 +912,38 @@ TEST_F(libyuvTest, I420Rotate270) { kRotateCounterClockwise); for (i = 0; i < y_plane_size; ++i) { - if (orig_y[i] != ro0_y[i]) + if (orig_y[i] != ro0_y[i]) { ++err; + } } for (i = 0; i < uv_plane_size; ++i) { - if (orig_u[i] != ro0_u[i]) + if (orig_u[i] != ro0_u[i]) { ++err; - if (orig_v[i] != ro0_v[i]) + } + if (orig_v[i] != ro0_v[i]) { ++err; + } } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate90) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; int yw = 1024; int yh = 768; @@ -960,47 +952,53 @@ 
TEST_F(libyuvTest, NV12ToI420Rotate90) { int uvh = (yh + 1) >> 1; int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); - int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); + int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); - - ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_uv, 0, uv_plane_size); + memset(ro0_y, 0, y_plane_size); + memset(ro0_u, 0, uv_plane_size); + memset(ro0_v, 0, uv_plane_size); + memset(ro90_y, 0, y_plane_size); + memset(ro90_u, 0, uv_plane_size); + memset(ro90_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { - for (j = b; j < ((2 * uvw) + b); j += 2) { + for (j = b; j < (uvw * 2 + b); j += 2) { uint8 random_number = random() & 0x7f; - orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; - orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + orig_uv[i * (uvw * 2 + b * 2) + j] = random_number; + orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number; } } - int y_off_0 = b * (yw + (2 * b)) + b; - int uv_off_0 = b * (uvw + (2 * b)) + b; - int y_off_90 = b * (yh + (2 * b)) + b; - int uv_off_90 = b * (uvh + (2 * b)) + b; + int y_off_0 = b * (yw + b * 2) + b; + int uv_off_0 = b * (uvw + b * 2) + b; + int y_off_90 = b * (yh + b * 2) + b; + int uv_off_90 = b * (uvh + b * 2) + b; - int y_st_0 = yw + (2 * b); - int uv_st_0 = uvw + (2 * b); - int y_st_90 = yh + (2 * b); - int uv_st_90 = uvh + (2 * b); + int y_st_0 = yw + b * 2; + int uv_st_0 = uvw + b * 2; + int y_st_90 = yh + b * 2; + int uv_st_90 = uvh + b * 2; NV12ToI420Rotate(orig_y+y_off_0, y_st_0, orig_uv+y_off_0, y_st_0, @@ -1027,32 +1025,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) { int zero_cnt = 0; for (i = 0; i < uv_plane_size; ++i) { - if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) { ++err; - if (ro0_u[i] != 0) + } + if (ro0_u[i] != 0) { ++zero_cnt; + } } - if (!zero_cnt) + if (!zero_cnt) { ++err; + } - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + 
free_aligned_buffer_16(ro90_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate270) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -1062,47 +1060,53 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); - int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); + int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); - - ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_uv, 0, o_uv_plane_size); + memset(ro0_y, 0, y_plane_size); + memset(ro0_u, 0, uv_plane_size); + memset(ro0_v, 0, uv_plane_size); + memset(ro270_y, 0, y_plane_size); + memset(ro270_u, 0, uv_plane_size); + memset(ro270_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { - for (j = b; j < ((2 * uvw) + b); j += 2) { + for (j = b; j < (uvw * 2 + b); j += 2) { uint8 random_number = random() & 0x7f; - orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; - orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + orig_uv[i * (uvw * 2 + b * 2) + j] = random_number; + orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number; } } - int y_off_0 = b * (yw + (2 * b)) + b; - int uv_off_0 = b * (uvw + (2 * b)) + b; - int y_off_270 = b * (yh + (2 * b)) + b; - int uv_off_270 = b * (uvh + (2 * b)) + b; + int y_off_0 = b * (yw + b * 2) + b; + int uv_off_0 = b * (uvw + b * 2) + b; + int y_off_270 = b * (yh + b * 2) + b; + int uv_off_270 = b * (uvh + b * 2) + b; - int y_st_0 = yw + (2 * b); - int uv_st_0 = uvw + (2 * b); - int y_st_270 = yh + (2 * b); - int uv_st_270 = uvh + (2 * b); + int y_st_0 = yw + b * 2; + int uv_st_0 = uvw + b * 2; + int y_st_270 = yh + b * 2; + int uv_st_270 = uvh + b * 2; NV12ToI420Rotate(orig_y+y_off_0, y_st_0, orig_uv+y_off_0, y_st_0, @@ -1129,32 +1133,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { int zero_cnt = 0; for (i = 0; i < uv_plane_size; ++i) { - if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) { ++err; - if (ro0_u[i] != 0) + } + if (ro0_u[i] != 0) { ++zero_cnt; + } } - if (!zero_cnt) + if (!zero_cnt) { ++err; + } - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro270_y); - 
free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate180) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro180_y, *ro180_u, *ro180_v; int yw = 1024; int yh = 768; @@ -1164,43 +1168,49 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); - int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); + int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); - - ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - ro180_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - ro180_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - ro180_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) + align_buffer_16(ro180_y, y_plane_size) + align_buffer_16(ro180_u, uv_plane_size) + align_buffer_16(ro180_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_uv, 0, o_uv_plane_size); + memset(ro0_y, 0, y_plane_size); + memset(ro0_u, 0, uv_plane_size); + memset(ro0_v, 0, uv_plane_size); + memset(ro180_y, 0, y_plane_size); + memset(ro180_u, 0, uv_plane_size); + memset(ro180_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { - for (j = b; j < ((2 * uvw) + b); j += 2) { + for (j = b; j < (uvw * 2 + b); j += 2) { uint8 random_number = random() & 0x7f; - orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; - orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + orig_uv[i * (uvw * 2 + b * 2) + j] = random_number; + orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number; } } - int y_off = b * (yw + (2 * b)) + b; - int uv_off = b * (uvw + (2 * b)) + b; + int y_off = b * (yw + b * 2) + b; + int uv_off = b * (uvw + b * 2) + b; - int y_st = yw + (2 * b); - int uv_st = uvw + (2 * b); + int y_st = yw + b * 2; + int uv_st = uvw + b * 2; NV12ToI420Rotate(orig_y+y_off, y_st, orig_uv+y_off, y_st, @@ -1220,40 +1230,40 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { kRotate180); for (i = 0; i < y_plane_size; ++i) { - if (orig_y[i] != ro0_y[i]) + if (orig_y[i] != ro0_y[i]) { ++err; + } } int zero_cnt = 0; for (i = 0; i < uv_plane_size; ++i) { - if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) { ++err; - if (ro0_u[i] != 0) + } + if (ro0_u[i] != 0) { ++zero_cnt; + } } - if (!zero_cnt) + if (!zero_cnt) { ++err; + } - free(orig_y); - free(orig_uv); - 
free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro180_y); - free(ro180_u); - free(ro180_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro180_y) + free_aligned_buffer_16(ro180_u) + free_aligned_buffer_16(ro180_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; - uint8 *roc_y, *roc_u, *roc_v; int yw = 1024; int yh = 768; @@ -1262,51 +1272,59 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { int uvh = (yh + 1) >> 1; int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); - int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); + int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); - - roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - roc_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - roc_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - roc_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) + align_buffer_16(roc_y, y_plane_size) + align_buffer_16(roc_u, uv_plane_size) + align_buffer_16(roc_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_uv, 0, o_uv_plane_size); + memset(roa_y, 0, y_plane_size); + memset(roa_u, 0, uv_plane_size); + memset(roa_v, 0, uv_plane_size); + memset(rob_y, 0, y_plane_size); + memset(rob_u, 0, uv_plane_size); + memset(rob_v, 0, uv_plane_size); + memset(roc_y, 0, y_plane_size); + memset(roc_u, 0, uv_plane_size); + memset(roc_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { - for (j = b; j < ((2 * uvw) + b); j += 2) { + for (j = b; j < (uvw * 2 + b); j += 2) { uint8 random_number = random() & 0x7f; - orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; - orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + orig_uv[i * (uvw * 2 + b * 2) + j] = random_number; + orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number; } } - int y_off_0 = b * (yw + (2 * b)) + b; - int uv_off_0 = b * (uvw + (2 * b)) + b; - int y_off_90 = b * (yh + (2 * b)) + b; - int uv_off_90 = b * (uvh + (2 * b)) + b; + int y_off_0 = b * (yw + b * 2) + b; + int uv_off_0 = b * (uvw + b * 2) + b; + int y_off_90 = b * (yh + b * 
2) + b; + int uv_off_90 = b * (uvh + b * 2) + b; - int y_st_0 = yw + (2 * b); - int uv_st_0 = uvw + (2 * b); - int y_st_90 = yh + (2 * b); - int uv_st_90 = uvh + (2 * b); + int y_st_0 = yw + b * 2; + int uv_st_0 = uvw + b * 2; + int y_st_90 = yh + b * 2; + int uv_st_90 = uvh + b * 2; NV12ToI420Rotate(orig_y+y_off_0, y_st_0, orig_uv+y_off_0, y_st_0, @@ -1335,73 +1353,74 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { kRotate180); for (i = 0; i < y_plane_size; ++i) { - if (orig_y[i] != roc_y[i]) + if (orig_y[i] != roc_y[i]) { ++y_err; + } } if (y_err) { printf("input %dx%d \n", yw, yh); - print_array(orig_y, y_st_0, yh + (2 * b)); + PrintArray(orig_y, y_st_0, yh + b * 2); printf("rotate a\n"); - print_array(roa_y, y_st_90, y_st_0); + PrintArray(roa_y, y_st_90, y_st_0); printf("rotate b\n"); - print_array(rob_y, y_st_90, y_st_0); + PrintArray(rob_y, y_st_90, y_st_0); printf("rotate c\n"); - print_array(roc_y, y_st_0, y_st_90); + PrintArray(roc_y, y_st_0, y_st_90); } int zero_cnt = 0; for (i = 0; i < uv_plane_size; ++i) { - if ((signed char)roc_u[i] != -(signed char)roc_v[i]) + if ((signed char)roc_u[i] != -(signed char)roc_v[i]) { ++uv_err; - if (rob_u[i] != 0) + } + if (rob_u[i] != 0) { ++zero_cnt; + } } - if (!zero_cnt) + if (!zero_cnt) { ++uv_err; + } if (uv_err) { - printf("input %dx%d \n", (2 * uvw), uvh); - print_array(orig_uv, y_st_0, uvh + (2 * b)); + printf("input %dx%d \n", uvw * 2, uvh); + PrintArray(orig_uv, y_st_0, uvh + b * 2); printf("rotate a\n"); - print_array(roa_u, uv_st_90, uv_st_0); - print_array(roa_v, uv_st_90, uv_st_0); + PrintArray(roa_u, uv_st_90, uv_st_0); + PrintArray(roa_v, uv_st_90, uv_st_0); printf("rotate b\n"); - print_array(rob_u, uv_st_90, uv_st_0); - print_array(rob_v, uv_st_90, uv_st_0); + PrintArray(rob_u, uv_st_90, uv_st_0); + PrintArray(rob_v, uv_st_90, uv_st_0); printf("rotate c\n"); - print_array(roc_u, uv_st_0, uv_st_90); - print_array(roc_v, uv_st_0, uv_st_90); + PrintArray(roc_u, uv_st_0, uv_st_90); + PrintArray(roc_v, uv_st_0, uv_st_90); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); - free(roc_y); - free(roc_u); - free(roc_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) + free_aligned_buffer_16(roc_y) + free_aligned_buffer_16(roc_u) + free_aligned_buffer_16(roc_v) EXPECT_EQ(0, y_err + uv_err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; int yw = 1024; int yh = 768; @@ -1410,43 +1429,49 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { int uvh = (yh + 1) >> 1; int i, j; - int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); - int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); - int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + int y_plane_size = (yw + b * 2) * (yh + b * 2); + int uv_plane_size = (uvw + b * 2) * (uvh + b * 2); + int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2); srandom(time(NULL)); - orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); - - roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - roa_v = 
static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - - rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) + memset(orig_y, 0, y_plane_size); + memset(orig_uv, 0, o_uv_plane_size); + memset(roa_y, 0, y_plane_size); + memset(roa_u, 0, uv_plane_size); + memset(roa_v, 0, uv_plane_size); + memset(rob_y, 0, y_plane_size); + memset(rob_u, 0, uv_plane_size); + memset(rob_v, 0, uv_plane_size); // fill image buffers with random data for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { - orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + orig_y[i * (yw + b * 2) + j] = random() & 0xff; } } for (i = b; i < (uvh + b); ++i) { - for (j = b; j < ((2 * uvw) + b); j += 2) { + for (j = b; j < (uvw * 2 + b); j += 2) { uint8 random_number = random() & 0x7f; - orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; - orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + orig_uv[i * (uvw * 2 + b * 2) + j] = random_number; + orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number; } } - int y_off = b * (yw + (2 * b)) + b; - int uv_off = b * (uvw + (2 * b)) + b; + int y_off = b * (yw + b * 2) + b; + int uv_off = b * (uvw + b * 2) + b; - int y_st = yw + (2 * b); - int uv_st = uvw + (2 * b); + int y_st = yw + b * 2; + int uv_st = uvw + b * 2; NV12ToI420Rotate(orig_y+y_off, y_st, orig_uv+y_off, y_st, @@ -1472,48 +1497,53 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { if (y_err) { printf("input %dx%d \n", yw, yh); - print_array(orig_y, y_st, yh + (2 * b)); + PrintArray(orig_y, y_st, yh + b * 2); printf("rotate a\n"); - print_array(roa_y, y_st, yh + (2 * b)); + PrintArray(roa_y, y_st, yh + b * 2); printf("rotate b\n"); - print_array(rob_y, y_st, yh + (2 * b)); + PrintArray(rob_y, y_st, yh + b * 2); } int zero_cnt = 0; for (i = 0; i < uv_plane_size; ++i) { - if ((signed char)rob_u[i] != -(signed char)rob_v[i]) + if ((signed char)rob_u[i] != -(signed char)rob_v[i]) { ++uv_err; - if (rob_u[i] != 0) + } + if (rob_u[i] != 0) { ++zero_cnt; + } } - if (!zero_cnt) + if (!zero_cnt) { ++uv_err; + } if (uv_err) { - printf("input %dx%d \n", (2 * uvw), uvh); - print_array(orig_uv, y_st, uvh + (2 * b)); + printf("input %dx%d \n", uvw * 2, uvh); + PrintArray(orig_uv, y_st, uvh + b * 2); printf("rotate a\n"); - print_array(roa_u, uv_st, uvh + (2 * b)); - print_array(roa_v, uv_st, uvh + (2 * b)); + PrintArray(roa_u, uv_st, uvh + b * 2); + PrintArray(roa_v, uv_st, uvh + b * 2); printf("rotate b\n"); - print_array(rob_u, uv_st, uvh + (2 * b)); - print_array(rob_v, uv_st, uvh + (2 * b)); + PrintArray(rob_u, uv_st, uvh + b * 2); + PrintArray(rob_v, uv_st, uvh + b * 2); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) EXPECT_EQ(0, y_err + uv_err); } + +} // namespace libyuv diff --git 
a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc new file mode 100644 index 00000000..fef96764 --- /dev/null +++ b/files/unit_test/scale_argb_test.cc @@ -0,0 +1,255 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/scale_argb.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +static int ARGBTestFilter(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode f, int benchmark_iterations) { + const int b = 128; + int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4; + int src_stride_argb = (b * 2 + src_width) * 4; + + align_buffer_16(src_argb, src_argb_plane_size) + memset(src_argb, 1, src_argb_plane_size); + + int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4; + int dst_stride_argb = (b * 2 + dst_width) * 4; + + srandom(time(NULL)); + + int i, j; + for (i = b; i < (src_height + b); ++i) { + for (j = b; j < (src_width + b) * 4; ++j) { + src_argb[(i * src_stride_argb) + j] = (random() & 0xff); + } + } + + align_buffer_16(dst_argb_c, dst_argb_plane_size) + align_buffer_16(dst_argb_opt, dst_argb_plane_size) + memset(dst_argb_c, 2, dst_argb_plane_size); + memset(dst_argb_opt, 3, dst_argb_plane_size); + + // Warm up both versions for consistent benchmarks. + MaskCpuFlags(0); // Disable all CPU optimization. + ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + src_width, src_height, + dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, + dst_width, dst_height, f); + MaskCpuFlags(-1); // Enable all CPU optimization. + ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + src_width, src_height, + dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, + dst_width, dst_height, f); + + MaskCpuFlags(0); // Disable all CPU optimization. + double c_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + src_width, src_height, + dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, + dst_width, dst_height, f); + } + c_time = (get_time() - c_time) / benchmark_iterations; + + MaskCpuFlags(-1); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, + src_width, src_height, + dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, + dst_width, dst_height, f); + } + opt_time = (get_time() - opt_time) / benchmark_iterations; + + // Report performance of C vs OPT + printf("filter %d - %8d us C - %8d us OPT\n", + f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6)); + + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference isn't + // over 2. 
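A note on the harness pattern: ARGBTestFilter warms up both code paths, times the pure-C path under MaskCpuFlags(0) and the fully optimized path under MaskCpuFlags(-1), and then diffs the two outputs. A minimal, self-contained sketch of that recipe for a single ARGBScale call follows; the frame sizes, the flat fill, and the kFilterBilinear choice are arbitrary for illustration, and the timing loops are omitted.

```cpp
#include <stdio.h>
#include <string.h>

#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_argb.h"

// Minimal sketch of the C-vs-optimized recipe used by ARGBTestFilter.
int main() {
  const int sw = 64, sh = 32, dw = 32, dh = 16;
  static uint8 src[64 * 32 * 4];        // ARGB: 4 bytes per pixel
  static uint8 dst_c[32 * 16 * 4];
  static uint8 dst_opt[32 * 16 * 4];
  memset(src, 0x55, sizeof(src));       // any deterministic fill will do

  libyuv::MaskCpuFlags(0);              // disable all SIMD: pure C reference
  libyuv::ARGBScale(src, sw * 4, sw, sh, dst_c, dw * 4, dw, dh,
                    libyuv::kFilterBilinear);
  libyuv::MaskCpuFlags(-1);             // re-enable every detected CPU feature
  libyuv::ARGBScale(src, sw * 4, sw, sh, dst_opt, dw * 4, dw, dh,
                    libyuv::kFilterBilinear);

  // The tests tolerate a per-byte difference of 1 (2 for the /4 factor);
  // a plain memcmp is enough for this smoke test.
  printf("C and optimized outputs %s\n",
         memcmp(dst_c, dst_opt, sizeof(dst_c)) ? "differ slightly" : "match");
  return 0;
}
```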
+ int max_diff = 0; + for (i = b; i < (dst_height + b); ++i) { + for (j = b * 4; j < (dst_width + b) * 4; ++j) { + int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + dst_argb_opt[(i * dst_stride_argb) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + free_aligned_buffer_16(dst_argb_c) + free_aligned_buffer_16(dst_argb_opt) + free_aligned_buffer_16(src_argb) + return max_diff; +} + +TEST_F(libyuvTest, ARGBScaleDownBy2) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 2; + const int dst_height = src_height / 2; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy4) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 4; + const int dst_height = src_height / 4; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy5) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 5; + const int dst_height = src_height / 5; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy8) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 8; + const int dst_height = src_height / 8; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy16) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 16; + const int dst_height = src_height / 16; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy34) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width * 3 / 4; + const int dst_height = src_height * 3 / 4; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleDownBy38) { + int src_width = 1280; + int src_height = 720; + int dst_width = src_width * 3 / 8; + int dst_height = src_height * 3 / 8; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleTo1366) { + int src_width = 1280; + int src_height = 720; + int dst_width = 1366; + int dst_height = 768; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ARGBScaleTo4074) { + int src_width = 2880 * 2; + int src_height = 1800; + int 
dst_width = 4074; + int dst_height = 1272; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + + +TEST_F(libyuvTest, ARGBScaleTo853) { + int src_width = 1280; + int src_height = 720; + int dst_width = 853; + int dst_height = 480; + + for (int f = 0; f < 2; ++f) { + int max_diff = ARGBTestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +} // namespace libyuv diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc index e147d78b..55b4148d 100644 --- a/files/unit_test/scale_test.cc +++ b/files/unit_test/scale_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,152 +8,369 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/scale.h" -#include "unit_test.h" #include <stdlib.h> #include <time.h> -using namespace libyuv; - -#define align_buffer_16(var, size) \ - uint8 *var; \ - uint8 *var##_mem; \ - var##_mem = reinterpret_cast<uint8*>(calloc(size+15, sizeof(uint8))); \ - var = reinterpret_cast<uint8*> \ - ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f)); - -#define free_aligned_buffer_16(var) \ - free(var##_mem); \ - var = 0; - -TEST_F(libyuvTest, ScaleDownBy4) { - int b = 128; - int src_width = 1280; - int src_height = 720; - int src_width_uv = (src_width + 1) >> 1; - int src_height_uv = (src_height + 1) >> 1; +#include "libyuv/cpu_id.h" +#include "libyuv/scale.h" +#include "../unit_test/unit_test.h" - int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)); - int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b)); +namespace libyuv { - int src_stride_y = 2 * b + src_width; - int src_stride_uv = 2 * b + src_width_uv; +static int TestFilter(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode f, int rounding, int benchmark_iterations) { + const int b = 128 * rounding; + int src_width_uv = (src_width + rounding) >> 1; + int src_height_uv = (src_height + rounding) >> 1; - align_buffer_16(src_y, src_y_plane_size) - align_buffer_16(src_u, src_uv_plane_size) - align_buffer_16(src_v, src_uv_plane_size) + int src_y_plane_size = (src_width + b * 2) * (src_height + b * 2); + int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2); - int dst_width = src_width >> 2; - int dst_height = src_height >> 2; + int src_stride_y = b * 2 + src_width; + int src_stride_uv = b * 2 + src_width_uv; - int dst_width_uv = (dst_width + 1) >> 1; - int dst_height_uv = (dst_height + 1) >> 1; + align_buffer_page_end(src_y, src_y_plane_size) + align_buffer_page_end(src_u, src_uv_plane_size) + align_buffer_page_end(src_v, src_uv_plane_size) - int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)); - int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b)); + int dst_width_uv = (dst_width + rounding) >> 1; + int dst_height_uv = (dst_height + rounding) >> 1; - int dst_stride_y = 2 * b + dst_width; - int dst_stride_uv = 2 * b + dst_width_uv; + int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2); + int dst_uv_plane_size = (dst_width_uv + b * 2) * 
(dst_height_uv + b * 2); - align_buffer_16(dst_y, dst_y_plane_size) - align_buffer_16(dst_u, dst_uv_plane_size) - align_buffer_16(dst_v, dst_uv_plane_size) + int dst_stride_y = b * 2 + dst_width; + int dst_stride_uv = b * 2 + dst_width_uv; - // create an image with random data reoccurring in 4x4 grid. When the image - // is filtered all the values should be the same. srandom(time(NULL)); - uint8 block_data[16]; - int i, j; - - // Pulling 16 random numbers there is an infinitesimally small - // chance that they are all 0. Then the output will be all 0. - // Output buffer is filled with 0, want to make sure that after the - // filtering something went into the output buffer. - // Avoid this by setting one of the values to 128. Also set the - // random data to at least 1 for when point sampling to prevent - // output all being 0. - block_data[0] = 128; - - for (i = 1; i < 16; i++) - block_data[i] = (random() & 0xfe) + 1; - - for (i = b; i < (src_height + b); i += 4) { - for (j = b; j < (src_width + b); j += 4) { - uint8 *ptr = src_y + (i * src_stride_y) + j; - int k, l; - for (k = 0; k < 4; ++k) - for (l = 0; l < 4; ++l) - ptr[k + src_stride_y * l] = block_data[k + 4 * l]; + for (i = b; i < (src_height + b); ++i) { + for (j = b; j < (src_width + b); ++j) { + src_y[(i * src_stride_y) + j] = (random() & 0xff); } } - for (i = 1; i < 16; i++) - block_data[i] = (random() & 0xfe) + 1; - - for (i = b; i < (src_height_uv + b); i += 4) { - for (j = b; j < (src_width_uv + b); j += 4) { - uint8 *ptru = src_u + (i * src_stride_uv) + j; - uint8 *ptrv = src_v + (i * src_stride_uv) + j; - int k, l; - for (k = 0; k < 4; ++k) - for (l = 0; l < 4; ++l) { - ptru[k + src_stride_uv * l] = block_data[k + 4 * l]; - ptrv[k + src_stride_uv * l] = block_data[k + 4 * l]; - } + for (i = b; i < (src_height_uv + b); ++i) { + for (j = b; j < (src_width_uv + b); ++j) { + src_u[(i * src_stride_uv) + j] = (random() & 0xff); + src_v[(i * src_stride_uv) + j] = (random() & 0xff); } } - int f; - int err = 0; + align_buffer_page_end(dst_y_c, dst_y_plane_size) + align_buffer_page_end(dst_u_c, dst_uv_plane_size) + align_buffer_page_end(dst_v_c, dst_uv_plane_size) + align_buffer_page_end(dst_y_opt, dst_y_plane_size) + align_buffer_page_end(dst_u_opt, dst_uv_plane_size) + align_buffer_page_end(dst_v_opt, dst_uv_plane_size) + + // Warm up both versions for consistent benchmarks. + MaskCpuFlags(0); // Disable all CPU optimization. + I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, + src_u + (src_stride_uv * b) + b, src_stride_uv, + src_v + (src_stride_uv * b) + b, src_stride_uv, + src_width, src_height, + dst_y_c + (dst_stride_y * b) + b, dst_stride_y, + dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv, + dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, + dst_width, dst_height, f); + MaskCpuFlags(-1); // Enable all CPU optimization. + I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, + src_u + (src_stride_uv * b) + b, src_stride_uv, + src_v + (src_stride_uv * b) + b, src_stride_uv, + src_width, src_height, + dst_y_opt + (dst_stride_y * b) + b, dst_stride_y, + dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv, + dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, + dst_width, dst_height, f); - // currently three filter modes, defined as FilterMode in scale.h - for (f = 0; f < 3; ++f) { + MaskCpuFlags(0); // Disable all CPU optimization. 
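Both harnesses allocate their planes through the macros defined in unit_test.h (visible further down in this change): align_buffer_16 rounds a malloc'ed block up to a 16-byte boundary, while align_buffer_page_end, used by TestFilter here, sizes the allocation up to a 4 KB multiple and pushes the usable region flush against its end, so a read past the buffer walks straight off the block. A sketch of the same two allocations as plain functions, with illustrative names:

```cpp
#include <stdlib.h>

#include "libyuv/basic_types.h"

// Plain-function equivalents of the allocation macros in unit_test.h
// (illustrative names; the tests use the macros directly).

// align_buffer_16: round the pointer up to the next 16-byte boundary.
static uint8* AllocAligned16(size_t size, uint8** out_mem) {
  *out_mem = static_cast<uint8*>(malloc(size + 15));  // slack for rounding
  return reinterpret_cast<uint8*>(
      (reinterpret_cast<intptr_t>(*out_mem) + 15) & ~15);
}

// align_buffer_page_end: round the allocation up to a 4 KB multiple and put
// the usable region at its very end, so even a one-byte overread leaves the
// malloc'ed block.
static uint8* AllocPageEnd(size_t size, uint8** out_mem) {
  *out_mem = static_cast<uint8*>(malloc((size + 4095) & ~size_t(4095)));
  return *out_mem + (-static_cast<intptr_t>(size) & 4095);
}

// The matching free_aligned_buffer_16 / _page_end macros free the original
// *out_mem pointer, never the adjusted alias.
```

The explicit memset calls now added after each align_buffer_16 in the rotate tests are needed because these macros use malloc, whereas the calloc calls they replace zeroed the memory.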
+ double c_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, src_u + (src_stride_uv * b) + b, src_stride_uv, src_v + (src_stride_uv * b) + b, src_stride_uv, src_width, src_height, - dst_y + (dst_stride_y * b) + b, dst_stride_y, - dst_u + (dst_stride_uv * b) + b, dst_stride_uv, - dst_v + (dst_stride_uv * b) + b, dst_stride_uv, - dst_width, dst_height, - static_cast<FilterMode>(f)); - - int value = dst_y[(dst_stride_y * b) + b]; - - // catch the case that the output buffer is all 0 - if (value == 0) - ++err; - - for (i = b; i < (dst_height + b); ++i) { - for (j = b; j < (dst_width + b); ++j) { - if (value != dst_y[(i * dst_stride_y) + j]) - ++err; - } - } + dst_y_c + (dst_stride_y * b) + b, dst_stride_y, + dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv, + dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, + dst_width, dst_height, f); + } + c_time = (get_time() - c_time) / benchmark_iterations; - value = dst_u[(dst_stride_uv * b) + b]; + MaskCpuFlags(-1); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, + src_u + (src_stride_uv * b) + b, src_stride_uv, + src_v + (src_stride_uv * b) + b, src_stride_uv, + src_width, src_height, + dst_y_opt + (dst_stride_y * b) + b, dst_stride_y, + dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv, + dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, + dst_width, dst_height, f); + } + opt_time = (get_time() - opt_time) / benchmark_iterations; + + // Report performance of C vs OPT + printf("filter %d - %8d us C - %8d us OPT\n", + f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6)); - if (value == 0) - ++err; + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference isn't + // over 2. 
+ int max_diff = 0; + for (i = b; i < (dst_height + b); ++i) { + for (j = b; j < (dst_width + b); ++j) { + int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] - + dst_y_opt[(i * dst_stride_y) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } - for (i = b; i < (dst_height_uv + b); ++i) { - for (j = b; j < (dst_width_uv + b); ++j) { - if (value != dst_u[(i * dst_stride_uv) + j]) - ++err; - if (value != dst_v[(i * dst_stride_uv) + j]) - ++err; + for (i = b; i < (dst_height_uv + b); ++i) { + for (j = b; j < (dst_width_uv + b); ++j) { + int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] - + dst_u_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] - + dst_v_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } } - free_aligned_buffer_16(src_y) - free_aligned_buffer_16(src_u) - free_aligned_buffer_16(src_v) - free_aligned_buffer_16(dst_y) - free_aligned_buffer_16(dst_u) - free_aligned_buffer_16(dst_v) + free_aligned_buffer_page_end(dst_y_c) + free_aligned_buffer_page_end(dst_u_c) + free_aligned_buffer_page_end(dst_v_c) + free_aligned_buffer_page_end(dst_y_opt) + free_aligned_buffer_page_end(dst_u_opt) + free_aligned_buffer_page_end(dst_v_opt) + + free_aligned_buffer_page_end(src_y) + free_aligned_buffer_page_end(src_u) + free_aligned_buffer_page_end(src_v) + + return max_diff; +} + +TEST_F(libyuvTest, ScaleDownBy2) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 2; + const int dst_height = src_height / 2; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleDownBy4) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 4; + const int dst_height = src_height / 4; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 2); // This is the only scale factor with error of 2. 
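Each of these scale tests reduces the C and optimized outputs to a single max-absolute-difference value and asserts a tolerance of 1 (2 only for the /4 case above), since the two paths may apply their rounding in a different order. The per-plane comparison TestFilter performs, distilled into a standalone helper with an illustrative name:

```cpp
#include <stdlib.h>

#include "libyuv/basic_types.h"

// Largest absolute per-byte difference between two equally sized planes;
// the tests inline this same loop for the Y, U and V planes.
static int MaxAbsPlaneDiff(const uint8* a, const uint8* b,
                           int stride, int width, int height) {
  int max_diff = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int diff = abs(a[y * stride + x] - b[y * stride + x]);
      if (diff > max_diff) {
        max_diff = diff;
      }
    }
  }
  return max_diff;
}

// Usage mirroring the tests:
//   EXPECT_LE(MaxAbsPlaneDiff(dst_y_c, dst_y_opt,
//                             dst_stride_y, dst_width, dst_height), 1);
```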
+ } +} + +TEST_F(libyuvTest, ScaleDownBy5) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 5; + const int dst_height = src_height / 5; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleDownBy8) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 8; + const int dst_height = src_height / 8; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleDownBy16) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width / 16; + const int dst_height = src_height / 16; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleDownBy34) { + const int src_width = 1280; + const int src_height = 720; + const int dst_width = src_width * 3 / 4; + const int dst_height = src_height * 3 / 4; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleDownBy38) { + int src_width = 1280; + int src_height = 720; + int dst_width = src_width * 3 / 8; + int dst_height = src_height * 3 / 8; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleTo1366) { + int src_width = 1280; + int src_height = 720; + int dst_width = 1366; + int dst_height = 768; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleTo4074) { + int src_width = 2880 * 2; + int src_height = 1800; + int dst_width = 4074; + int dst_height = 1272; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleTo853) { + int src_width = 1280; + int src_height = 720; + int dst_width = 853; + int dst_height = 480; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleTo853Wrong) { + int src_width = 1280; + int src_height = 720; + int dst_width = 853; + int dst_height = 480; - EXPECT_EQ(0, err); + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 0, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } } + +// A one off test for a screen cast resolution scale. 
+TEST_F(libyuvTest, ScaleTo684) { + int src_width = 686; + int src_height = 557; + int dst_width = 684; + int dst_height = 552; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleTo342) { + int src_width = 686; + int src_height = 557; + int dst_width = 342; + int dst_height = 276; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +TEST_F(libyuvTest, ScaleToHalf342) { + int src_width = 684; + int src_height = 552; + int dst_width = 342; + int dst_height = 276; + + for (int f = 0; f < 3; ++f) { + int max_diff = TestFilter(src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f), 1, + benchmark_iterations_); + EXPECT_LE(max_diff, 1); + } +} + +} // namespace libyuv diff --git a/files/unit_test/testdata/arm_v7.txt b/files/unit_test/testdata/arm_v7.txt new file mode 100644 index 00000000..5d7dbd04 --- /dev/null +++ b/files/unit_test/testdata/arm_v7.txt @@ -0,0 +1,12 @@ +Processor : ARMv7 Processor rev 5 (v7l) +BogoMIPS : 795.44 +Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16 +CPU implementer : 0x56 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0x581 +CPU revision : 5 + +Hardware : OLPC XO-1.75 +Revision : 0000 +Serial : 0000000000000000 diff --git a/files/unit_test/testdata/tegra3.txt b/files/unit_test/testdata/tegra3.txt new file mode 100644 index 00000000..d1b09f6b --- /dev/null +++ b/files/unit_test/testdata/tegra3.txt @@ -0,0 +1,23 @@ +Processor : ARMv7 Processor rev 9 (v7l) +processor : 0 +BogoMIPS : 1992.29 + +processor : 1 +BogoMIPS : 1992.29 + +processor : 2 +BogoMIPS : 1992.29 + +processor : 3 +BogoMIPS : 1992.29 + +Features : swp half thumb fastmult vfp edsp neon vfpv3 +CPU implementer : 0×41 +CPU architecture: 7 +CPU variant : 0×2 +CPU part : 0xc09 +CPU revision : 9 + +Hardware : cardhu +Revision : 0000 + diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc index 1996adf1..007c81f0 100644 --- a/files/unit_test/unit_test.cc +++ b/files/unit_test/unit_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,33 +8,26 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <cstring> -#include "unit_test.h" - -class libyuvEnvironment : public ::testing::Environment { - public: - virtual void SetUp() { - } +#include "../unit_test/unit_test.h" - virtual void TearDown() { - } -}; - -libyuvTest::libyuvTest() : - _rotate_max_w(128), - _rotate_max_h(128) { -} +#include <stdlib.h> // For getenv() -void libyuvTest::SetUp() { -} +#include <cstring> -void libyuvTest::TearDown() { +// Change this to 1000 for benchmarking. +// TODO(fbarchard): Add command line parsing to pass this as option. 
+#define BENCHMARK_ITERATIONS 1 + +libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), + benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(1280), + benchmark_height_(720) { + const char* repeat = getenv("LIBYUV_REPEAT"); + if (repeat) { + benchmark_iterations_ = atoi(repeat); // NOLINT + } } int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - libyuvEnvironment* env = new libyuvEnvironment; - ::testing::AddGlobalTestEnvironment(env); - return RUN_ALL_TESTS(); -}
\ No newline at end of file +} diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h index cac30c72..62521e88 100644 --- a/files/unit_test/unit_test.h +++ b/files/unit_test/unit_test.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,20 +8,67 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef UINIT_TEST_H_ -#define UINIT_TEST_H_ +#ifndef UNIT_TEST_UNIT_TEST_H_ +#define UNIT_TEST_UNIT_TEST_H_ #include <gtest/gtest.h> +#define align_buffer_16(var, size) \ + uint8* var; \ + uint8* var##_mem; \ + var##_mem = reinterpret_cast<uint8*>(malloc((size) + 15)); \ + var = reinterpret_cast<uint8*> \ + ((reinterpret_cast<intptr_t>(var##_mem) + 15) & ~15); + +#define free_aligned_buffer_16(var) \ + free(var##_mem); \ + var = 0; + + +#define align_buffer_page_end(var, size) \ + uint8* var; \ + uint8* var##_mem; \ + var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \ + var = var##_mem + (-(size) & 4095); + +#define free_aligned_buffer_page_end(var) \ + free(var##_mem); \ + var = 0; + +#ifdef WIN32 +#include <windows.h> +static inline double get_time() { + LARGE_INTEGER t, f; + QueryPerformanceCounter(&t); + QueryPerformanceFrequency(&f); + return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart); +} + +#define random rand +#define srandom srand +#else + +#include <sys/time.h> +#include <sys/resource.h> + +static inline double get_time() { + struct timeval t; + struct timezone tzp; + gettimeofday(&t, &tzp); + return t.tv_sec + t.tv_usec * 1e-6; +} +#endif + class libyuvTest : public ::testing::Test { protected: libyuvTest(); - virtual void SetUp(); - virtual void TearDown(); - const int _rotate_max_w; - const int _rotate_max_h; + const int rotate_max_w_; + const int rotate_max_h_; + int benchmark_iterations_; + const int benchmark_width_; + const int benchmark_height_; }; -#endif // UNIT_TEST_H_ +#endif // UNIT_TEST_UNIT_TEST_H_ diff --git a/files/unit_test/version_test.cc b/files/unit_test/version_test.cc new file mode 100644 index 00000000..c53d754c --- /dev/null +++ b/files/unit_test/version_test.cc @@ -0,0 +1,42 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> + +#include "libyuv/basic_types.h" +#include "libyuv/version.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +// Tests SVN version against include/libyuv/version.h +// SVN version is bumped by documentation changes as well as code. +// Although the versions should match, once checked in, a tolerance is allowed. +TEST_F(libyuvTest, TestVersion) { + EXPECT_GE(LIBYUV_VERSION, 169); // 169 is first version to support version. 
+ printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION); +#ifdef LIBYUV_SVNREVISION + const char *ver = strchr(LIBYUV_SVNREVISION, ':'); + if (ver) { + ++ver; + } else { + ver = LIBYUV_SVNREVISION; + } + int svn_revision = atoi(ver); // NOLINT + printf("LIBYUV_SVNREVISION %d\n", svn_revision); + EXPECT_NEAR(LIBYUV_VERSION, svn_revision, 3); // Allow version to be close. + if (LIBYUV_VERSION != svn_revision) { + printf("WARNING - Versions do not match.\n"); + } +#endif +} + +} // namespace libyuv diff --git a/files/util/compare.cc b/files/util/compare.cc new file mode 100644 index 00000000..f030c799 --- /dev/null +++ b/files/util/compare.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "libyuv/basic_types.h" +#include "libyuv/compare.h" +#include "libyuv/version.h" + +int main(int argc, char** argv) { + if (argc < 1) { + printf("libyuv compare v%d\n", LIBYUV_VERSION); + printf("compare file1.yuv file2.yuv\n"); + return -1; + } + char* name1 = argv[1]; + char* name2 = (argc > 2) ? argv[2] : NULL; + FILE* fin1 = fopen(name1, "rb"); + FILE* fin2 = name2 ? fopen(name2, "rb") : NULL; + + const int kBlockSize = 32768; + uint8 buf1[kBlockSize]; + uint8 buf2[kBlockSize]; + uint32 hash1 = 5381; + uint32 hash2 = 5381; + uint64 sum_square_err = 0; + uint64 size_min = 0; + int amt1 = 0; + int amt2 = 0; + do { + amt1 = fread(buf1, 1, kBlockSize, fin1); + if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1); + if (fin2) { + amt2 = fread(buf2, 1, kBlockSize, fin2); + if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2); + int amt_min = (amt1 < amt2) ? amt1 : amt2; + size_min += amt_min; + sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min); + } + } while (amt1 > 0 || amt2 > 0); + + printf("hash1 %x", hash1); + if (fin2) { + printf(", hash2 %x", hash2); + double mse = static_cast<double>(sum_square_err) / + static_cast<double>(size_min); + printf(", mse %.2f", mse); + double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min); + printf(", psnr %.2f\n", psnr); + fclose(fin2); + } + fclose(fin1); +} + |