author     Hendrik Dahlkamp <hendrik@google.com>  2013-01-23 18:27:37 -0800
committer  Adam Hampson <ahampson@google.com>  2013-01-28 15:39:41 -0800
commit     33cfdeb7b267ab635413797fffb046b73272f7ec (patch)
tree       8ff16b765a83ba911233a1d7bfa27cce9cee3b7c
parent     a88a10a6ed9f9801852929bac34bdf10510116f4 (diff)
download   libyuv-33cfdeb7b267ab635413797fffb046b73272f7ec.tar.gz
Update libyuv to r397
Change-Id: I70f5a527de52ae8ae80b189873c9a094035dfa2c
Signed-off-by: Hendrik Dahlkamp <hendrik@google.com>
-rw-r--r--  Android.mk  25
-rw-r--r--  README.google  7
-rw-r--r--  files/AUTHORS  4
-rw-r--r--  files/codereview.settings  12
-rw-r--r--  files/include/libyuv.h  17
-rw-r--r--  files/include/libyuv/basic_types.h  71
-rw-r--r--  files/include/libyuv/compare.h  73
-rw-r--r--  files/include/libyuv/convert.h  257
-rw-r--r--  files/include/libyuv/convert_argb.h  228
-rw-r--r--  files/include/libyuv/convert_from.h  165
-rw-r--r--  files/include/libyuv/cpu_id.h  59
-rw-r--r--  files/include/libyuv/format_conversion.h  173
-rw-r--r--  files/include/libyuv/general.h  47
-rw-r--r--  files/include/libyuv/mjpeg_decoder.h  188
-rw-r--r--  files/include/libyuv/planar_functions.h  344
-rw-r--r--  files/include/libyuv/rotate.h  88
-rw-r--r--  files/include/libyuv/rotate_argb.h  33
-rw-r--r--  files/include/libyuv/row.h  731
-rw-r--r--  files/include/libyuv/scale.h  47
-rw-r--r--  files/include/libyuv/scale_argb.h  34
-rw-r--r--  files/include/libyuv/version.h  16
-rw-r--r--  files/include/libyuv/video_common.h  159
-rw-r--r--  files/libyuv.gyp  89
-rwxr-xr-x  files/libyuv_test.gyp  74
-rw-r--r--  files/source/compare.cc  571
-rw-r--r--  files/source/compare_neon.cc  62
-rw-r--r--  files/source/conversion_tables.h  8
-rw-r--r--  files/source/convert.cc  2627
-rw-r--r--  files/source/convert_argb.cc  1300
-rw-r--r--  files/source/convert_from.cc  1425
-rw-r--r--  files/source/cpu_id.cc  206
-rw-r--r--  files/source/format_conversion.cc  561
-rw-r--r--  files/source/general.cc  284
-rw-r--r--  files/source/mjpeg_decoder.cc  583
-rw-r--r--  files/source/planar_functions.cc  2411
-rw-r--r--  files/source/rotate.cc  1350
-rw-r--r--  files/source/rotate_argb.cc  175
-rw-r--r--  files/source/rotate_neon.cc  406
-rw-r--r--  files/source/rotate_neon.s  563
-rw-r--r--  files/source/rotate_priv.h  72
-rw-r--r--  files/source/row.h  167
-rw-r--r--  files/source/row_common.cc  1246
-rw-r--r--  files/source/row_neon.cc  829
-rw-r--r--  files/source/row_posix.cc  4211
-rw-r--r--  files/source/row_table.cc  469
-rw-r--r--  files/source/row_win.cc  4087
-rw-r--r--  files/source/scale.cc  4051
-rw-r--r--  files/source/scale_argb.cc  1035
-rw-r--r--  files/source/scale_neon.cc  534
-rw-r--r--  files/source/video_common.cc  19
-rw-r--r--  files/source/video_common.h  82
-rw-r--r--  files/unit_test/compare_test.cc  450
-rw-r--r--  files/unit_test/cpu_test.cc  100
-rw-r--r--  files/unit_test/planar_test.cc  1005
-rw-r--r--  files/unit_test/rotate_argb_test.cc  195
-rw-r--r--  files/unit_test/rotate_test.cc  1194
-rw-r--r--  files/unit_test/scale_argb_test.cc  255
-rw-r--r--  files/unit_test/scale_test.cc  447
-rw-r--r--  files/unit_test/testdata/arm_v7.txt  12
-rw-r--r--  files/unit_test/testdata/tegra3.txt  23
-rw-r--r--  files/unit_test/unit_test.cc  39
-rw-r--r--  files/unit_test/unit_test.h  63
-rw-r--r--  files/unit_test/version_test.cc  42
-rw-r--r--  files/util/compare.cc  64
64 files changed, 27342 insertions, 8822 deletions
diff --git a/Android.mk b/Android.mk
index 626f7a1f..d1c565b1 100644
--- a/Android.mk
+++ b/Android.mk
@@ -5,19 +5,32 @@ ifeq ($(TARGET_ARCH),arm)
LOCAL_PATH := $(call my-dir)
common_SRC_FILES := \
+ files/source/compare.cc \
files/source/convert.cc \
+ files/source/convert_argb.cc \
+ files/source/convert_from.cc \
+ files/source/cpu_id.cc \
files/source/format_conversion.cc \
files/source/planar_functions.cc \
- files/source/row_posix.cc \
- files/source/video_common.cc \
- files/source/cpu_id.cc \
- files/source/general.cc \
files/source/rotate.cc \
- files/source/row_table.cc \
- files/source/scale.cc
+ files/source/rotate_argb.cc \
+ files/source/row_common.cc \
+ files/source/row_posix.cc \
+ files/source/scale.cc \
+ files/source/scale_argb.cc \
+ files/source/video_common.cc
common_CFLAGS := -Wall -fexceptions
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+ common_CFLAGS += -DLIBYUV_NEON
+ common_SRC_FILES += \
+ files/source/compare_neon.cc.neon \
+ files/source/rotate_neon.cc.neon \
+ files/source/row_neon.cc.neon \
+ files/source/scale_neon.cc.neon
+endif
+
common_C_INCLUDES = $(LOCAL_PATH)/files/include
# For the device
diff --git a/README.google b/README.google
index 79828ab8..c887302d 100644
--- a/README.google
+++ b/README.google
@@ -1,10 +1,11 @@
-URL: http://libyuv.googlecode.com/svn-history/r52/trunk/
-Version: r52
+URL: http://libyuv.googlecode.com/svn-history/r397/trunk/
+Version: r397
License: BSD
License File: LICENSE
Description:
-libyuv is an open-source library for yuv conversion and scaling.
+libyuv is an open-source library for yuv scaling, conversion, comparison
+and rendering.
Specifically libyuv is optimized for SSE2/SSSE3 and Neon and has demonstrated
speed up to 10x to 16x compared to C code.
diff --git a/files/AUTHORS b/files/AUTHORS
new file mode 100644
index 00000000..9686ac13
--- /dev/null
+++ b/files/AUTHORS
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/files/codereview.settings b/files/codereview.settings
new file mode 100644
index 00000000..11270bba
--- /dev/null
+++ b/files/codereview.settings
@@ -0,0 +1,12 @@
+# This file is used by gcl to get repository specific information.
+# The LibYuv code review is via WebRtc's code review
+CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
+#CC_LIST:
+#VIEW_VC:
+#STATUS:
+TRY_ON_UPLOAD: False
+TRYSERVER_HTTP_HOST: webrtc-cb-linux-master.cbf.corp.google.com
+TRYSERVER_HTTP_PORT: 9020
+#TRYSERVER_SVN_URL:
+#GITCL_PREUPLOAD:
+#GITCL_PREDCOMMIT:
diff --git a/files/include/libyuv.h b/files/include/libyuv.h
index 5a30e2d0..06f26aae 100644
--- a/files/include/libyuv.h
+++ b/files/include/libyuv.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,17 +8,22 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef LIBYUV_INCLUDE_LIBYUV_H_
-#define LIBYUV_INCLUDE_LIBYUV_H_
+#ifndef INCLUDE_LIBYUV_H_ // NOLINT
+#define INCLUDE_LIBYUV_H_
#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
-#include "libyuv/general.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
-#endif // LIBYUV_INCLUDE_LIBYUV_H_
+#endif // INCLUDE_LIBYUV_H_ NOLINT
diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h
index 5adc2bfd..9e9f2abc 100644
--- a/files/include/libyuv/basic_types.h
+++ b/files/include/libyuv/basic_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,27 +8,18 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
-#ifndef WIN32
+#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
-typedef __int64 int64;
-#else
-typedef long long int64;
-#endif /* COMPILER_MSVC */
-typedef int int32;
-typedef short int16;
-typedef char int8;
-
-#ifdef COMPILER_MSVC
typedef unsigned __int64 uint64;
typedef __int64 int64;
#ifndef INT64_C
@@ -38,9 +29,20 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
-#else
-typedef unsigned long long uint64;
-typedef long long int64;
+#else // COMPILER_MSVC
+#ifdef __LP64__
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else // __LP64__
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
@@ -48,10 +50,14 @@ typedef long long int64;
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
-#endif /* COMPILER_MSVC */
+#endif // __LP64__
+#endif // COMPILER_MSVC
typedef unsigned int uint32;
-typedef unsigned short uint16;
+typedef int int32;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
typedef unsigned char uint8;
+typedef signed char int8;
#endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64.
@@ -59,10 +65,33 @@ typedef unsigned char uint8;
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
-#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1)))
+#ifndef ALIGNP
#define ALIGNP(p, t) \
- (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
- ((t)-1)) & ~((t)-1))))
+ (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+ ((t) - 1)) & ~((t) - 1))))
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
-#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h
new file mode 100644
index 00000000..5fd924b8
--- /dev/null
+++ b/files/include/libyuv/compare.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
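
Usage sketch: the comparison API added above can be exercised on two same-sized I420 frames as follows. The wrapper name and the tightly packed plane strides (width for Y, (width + 1) / 2 for U and V) are assumptions for illustration, not part of the diff.

#include "libyuv/compare.h"

// Assumes both frames use packed strides; returns PSNR in dB.
double I420FramePsnr(const uint8* a_y, const uint8* a_u, const uint8* a_v,
                     const uint8* b_y, const uint8* b_u, const uint8* b_v,
                     int width, int height) {
  const int half = (width + 1) / 2;  // chroma stride for packed I420
  return libyuv::I420Psnr(a_y, width, a_u, half, a_v, half,
                          b_y, width, b_u, half, b_v, half,
                          width, height);
}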
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
index fa3b6446..1d4b6a5b 100644
--- a/files/include/libyuv/convert.h
+++ b/files/include/libyuv/convert.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,90 +8,243 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define I420ToI420 I420Copy
+
+// Copy I420 to I420.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+// Convert NV12 to I420. Also used for NV21.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+// Convert Q420 to I420.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert V210 to I420.
+LIBYUV_API
+int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_frame, int dst_stride_frame,
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height);
-} // namespace libyuv
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
-#endif // INCLUDE_LIBYUV_CONVERT_H_
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
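
Usage sketch: the ConvertToI420() comment above describes centered cropping via crop_x/crop_y. A minimal caller, assuming packed destination strides and the kRotate0 enumerator from rotate.h; the helper name and the pass-through fourcc parameter are illustrative, not from the diff.

#include "libyuv/convert.h"

int CenterCropToI420(const uint8* sample, size_t sample_size, uint32 fourcc,
                     int src_width, int src_height,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int dst_width, int dst_height) {
  const int crop_x = (src_width - dst_width) / 2;    // center horizontally
  const int crop_y = (src_height - dst_height) / 2;  // center vertically
  return libyuv::ConvertToI420(sample, sample_size,
                               dst_y, dst_width,
                               dst_u, (dst_width + 1) / 2,
                               dst_v, (dst_width + 1) / 2,
                               crop_x, crop_y,
                               src_width, src_height,
                               dst_width, dst_height,
                               libyuv::kRotate0, fourcc);
}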
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
new file mode 100644
index 00000000..86085252
--- /dev/null
+++ b/files/include/libyuv/convert_argb.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// Add missing V210 and Q420.
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 (grey) to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert Q420 to ARGB.
+// LIBYUV_API
+// int Q420ToARGB(const uint8* src_y, int src_stride_y,
+// const uint8* src_yuy2, int src_stride_yuy2,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert V210 to ARGB.
+// LIBYUV_API
+// int V210ToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
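
Usage sketch: expanding a decoded I420 frame to ARGB for rendering with I420ToARGB() declared above. The 4-bytes-per-pixel destination stride and the helper name are assumptions for illustration.

#include "libyuv/convert_argb.h"

void I420FrameToARGB(const uint8* y, const uint8* u, const uint8* v,
                     int width, int height, uint8* argb) {
  const int half = (width + 1) / 2;
  libyuv::I420ToARGB(y, width, u, half, v, half,
                     argb, width * 4,  // ARGB is 4 bytes per pixel
                     width, height);
}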
diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h
new file mode 100644
index 00000000..4eae950c
--- /dev/null
+++ b/files/include/libyuv/convert_from.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// TODO(fbarchard): I420ToNV12
+// TODO(fbarchard): I420ToM420
+// TODO(fbarchard): I420ToQ420
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToV210(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
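
Usage sketch: ConvertFromI420() above accepts 0 for dst_sample_stride to request contiguous rows. A minimal wrapper; the name and packed source strides are illustrative, and the fourcc is left to the caller.

#include "libyuv/convert_from.h"

int PackI420(const uint8* y, const uint8* u, const uint8* v,
             int width, int height,
             uint8* dst_sample, uint32 fourcc) {
  const int half = (width + 1) / 2;
  return libyuv::ConvertFromI420(y, width, u, half, v, half,
                                 dst_sample, 0,  // 0 = contiguous rows
                                 width, height, fourcc);
}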
diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h
index c1000e86..0914f1d2 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/files/include/libyuv/cpu_id.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,28 +8,63 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#define INCLUDE_LIBYUV_CPU_ID_H_
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// These flags are only valid on x86 processors
-static const int kCpuHasSSE2 = 1;
-static const int kCpuHasSSSE3 = 2;
+// Internal flag to indicate cpuid is initialized.
+static const int kCpuInitialized = 0x1;
-// These flags are only valid on ARM processors
-static const int kCpuHasNEON = 4;
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
-// Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 8;
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
// Detect CPU has SSE2 etc.
-bool TestCpuFlag(int flag);
+// Test_flag parameter should be one of kCpuHas constants above.
+// returns non-zero if instruction set is detected
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+ return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+}
// For testing, allow CPU flags to be disabled.
-// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
void MaskCpuFlags(int enable_flags);
+// Low level cpuid for X86. Returns zeros on other CPUs.
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type);
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_CPU_ID_H_
+#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
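
Usage sketch: the inline TestCpuFlag() and MaskCpuFlags() above are typically used like this (function names are illustrative).

#include "libyuv/cpu_id.h"

bool HasSimdFastPath() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasNEON) != 0 ||
         libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
}

void DisableSimdForTesting() {
  libyuv::MaskCpuFlags(0);  // 0 disables all cpu specific optimizations
}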
diff --git a/files/include/libyuv/format_conversion.h b/files/include/libyuv/format_conversion.h
index d3d36f38..06bd387f 100644
--- a/files/include/libyuv/format_conversion.h
+++ b/files/include/libyuv/format_conversion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,34 +8,161 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Convert Bayer RGB formats to I420.
+LIBYUV_API
+int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
+ BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
+
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Convert I420 to Bayer RGB formats.
+LIBYUV_API
+int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Temporary API mapper.
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
+ I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
+
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+// Convert Bayer RGB formats to ARGB.
+LIBYUV_API
+int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Converts ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+// Temporary API mapper.
+#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-// Converts any Bayer RGB format to I420.
-int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Converts any Bayer RGB format to ARGB.
-int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
-
-// Converts ARGB to any Bayer RGB format.
-int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
- uint8* dst_bayer, int dst_stride_bayer,
- uint32 dst_fourcc_bayer,
- int width, int height);
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_
+#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
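
Usage sketch: demosaicing a BGGR capture directly to ARGB with BayerBGGRToARGB() above. Bayer data is one byte per pixel, so the source stride equals the width here; the helper name is illustrative.

#include "libyuv/format_conversion.h"

int DemosaicBGGRToARGB(const uint8* bayer, int width, int height,
                       uint8* argb) {
  return libyuv::BayerBGGRToARGB(bayer, width,     // 1 byte/pixel source
                                 argb, width * 4,  // 4 byte/pixel ARGB
                                 width, height);
}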
diff --git a/files/include/libyuv/general.h b/files/include/libyuv/general.h
deleted file mode 100644
index 58943c86..00000000
--- a/files/include/libyuv/general.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- * General operations on YUV images.
- */
-
-#ifndef INCLUDE_LIBYUV_GENERAL_H_
-#define INCLUDE_LIBYUV_GENERAL_H_
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-// I420 mirror
-int
-I420Mirror(const uint8* src_yplane, int src_ystride,
- const uint8* src_uplane, int src_ustride,
- const uint8* src_vplane, int src_vstride,
- uint8* dst_yplane, int dst_ystride,
- uint8* dst_uplane, int dst_ustride,
- uint8* dst_vplane, int dst_vstride,
- int width, int height);
-
-// Crop/Pad I420 frame to match required dimensions.
-int
-I420CropPad(const uint8* src_frame, int src_width,
- int src_height, uint8* dst_frame,
- int dst_width, int dst_height);
-
-// I420 Crop - crop a rectangle from image
-int
-I420Crop(uint8* frame,
- int src_width, int src_height,
- int dst_width, int dst_height);
-
-} // namespace libyuv
-
-#endif // INCLUDE_LIBYUV_GENERAL_H_
diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 00000000..67090cf0
--- /dev/null
+++ b/files/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv411,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format. Returns true if image looks valid and format is supported.
+ // If return value is true, then the values for all the following getters
+ // are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ bool LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ bool UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+ // dst_width must match exactly. dst_height must be <= to image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+ struct Buffer {
+ const uint8* data;
+ int len;
+ };
+
+ struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+ };
+
+ // Methods that are passed to jpeglib.
+ static int fill_input_buffer(jpeg_decompress_struct* cinfo);
+ static void init_source(jpeg_decompress_struct* cinfo);
+ static void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
+ static void term_source(jpeg_decompress_struct* cinfo);
+
+ static void ErrorHandler(jpeg_common_struct* cinfo);
+
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ bool StartDecode();
+ bool FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ bool DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // true iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ bool has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
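
Usage sketch: decoding a single MJPEG frame into caller-provided component buffers with the MJpegDecoder class above. Per the comments, planes must hold GetNumComponents() pointers to buffers of at least GetComponentSize(i) bytes, and DecodeToBuffers() advances those pointers. The helper name is illustrative.

#include "libyuv/mjpeg_decoder.h"

bool DecodeMjpegFrame(const uint8* src, size_t src_len, uint8** planes) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(src, src_len)) {
    return false;  // not a valid or supported JPEG
  }
  return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                 decoder.GetHeight());
}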
diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h
index 9c0a10a3..7e43dabb 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/files/include/libyuv/planar_functions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,155 +8,331 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// Copy I420 to I420.
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value);
-// Draw a rectangle into I420
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v);
+// Alias.
+#define I400ToI400 CopyPlane
-// Convert I422 to I420. Used by MJPG.
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+// Copy a plane of data (I420 to I400).
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert NV12 to I420. Also used for NV21.
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert NV12 to I420. Deprecated.
-int NV12ToI420(const uint8* src_y,
- const uint8* src_uv, int src_stride,
+// Convert UYVY to I422.
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert Q420 to I420.
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert M420 to I420.
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert YUY2 to I420.
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Aliases.
+#define ARGBToBGRA BGRAToARGB
+#define ARGBToABGR ABGRToARGB
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert UYVY to I420.
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+// ARGB little endian (bgra in memory) to I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert I420 to ARGB.
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
-// Convert I420 to BGRA.
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
-// Convert I420 to ABGR.
-int I420ToABGR(const uint8* src_y, int src_stride_y,
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y, int width, int height,
+ int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Convert I422 to ARGB.
-int I422ToARGB(const uint8* src_y, int src_stride_y,
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Apply a color matrix to each ARGB pixel.
+// matrix_argb is 12 signed coefficients (3 rows of 4 ARGB values),
+// -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Convert I444 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Convert I400 to ARGB.
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
-// Convert I400 to ARGB. Reverse of ARGBToI400
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
-// Convert RAW to ARGB.
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
-// Convert BG24 to ARGB.
-int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Computes a table of cumulative sums for an image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height);
-// Convert ABGR to ARGB. Also used for ARGB to ABGR.
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Blur ARGB image.
+// The caller should allocate a dst_cumsum table of width * height * 16 bytes,
+// aligned to a 16 byte boundary.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius);
-// Convert BGRA to ARGB. Also used for ARGB to BGRA.
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value);
-// Convert ARGB to I400.
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
+// Interpolate between two ARGB images using the specified amount of
+// interpolation (0 to 255) and store to destination.
+// 'interpolation' is an 8 bit fraction: 0 means 100% src_argb0 and
+// 255 means almost 100% src_argb1 (255/256ths).
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation);
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+// Row functions for copying pixels from a source with a slope to a row of the
+// destination. Useful for scaling, rotation, mirroring and texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
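As a usage illustration of the planar_functions.h API above (not part of the patch): a minimal crossfade built on ARGBInterpolate. The helper name, the malloc-based buffer, the tightly packed stride of width * 4 and the 0-on-success return convention are assumptions for this sketch; the extra 16 bytes follow the over-write caveat documented above.

#include <stdlib.h>
#include "libyuv/planar_functions.h"

// Return a newly malloc'd 50/50 blend of two same-sized packed ARGB images,
// or NULL on failure. Caller frees. The extra 16 bytes cover the documented
// possible over-write past the end of dst_argb.
uint8* CrossfadeARGB(const uint8* a, const uint8* b, int width, int height) {
  const int stride = width * 4;  // Tightly packed rows (assumption).
  uint8* dst = static_cast<uint8*>(malloc(stride * height + 16));
  if (!dst) return NULL;
  if (libyuv::ARGBInterpolate(a, stride, b, stride, dst, stride,
                              width, height, 128) != 0) {  // 128/256 ~= 50%.
    free(dst);
    return NULL;
  }
  return dst;
}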
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
index 65c38de3..e7608a2d 100644
--- a/files/include/libyuv/rotate.h
+++ b/files/include/libyuv/rotate.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,45 +8,103 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
#define INCLUDE_LIBYUV_ROTATE_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-// Supported rotation
+// Supported rotation.
enum RotationMode {
- kRotate0 = 0, // No rotation
- kRotate90 = 90, // Rotate 90 degrees clockwise
- kRotate180 = 180, // Rotate 180 degrees
- kRotate270 = 270, // Rotate 270 degrees clockwise
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
- // Deprecated
+ // Deprecated.
kRotateNone = 0,
kRotateClockwise = 90,
kRotateCounterClockwise = 270,
};
-// Rotate I420 frame
+// Rotate I420 frame.
+LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
- int width, int height,
- RotationMode mode);
+ int src_width, int src_height, RotationMode mode);
-// Rotate NV12 input and store in I420
+// Rotate NV12 input and store in I420.
+LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
- int width, int height,
- RotationMode mode);
+ int src_width, int src_height, RotationMode mode);
+// Rotate planes by 90, 180 or 270 degrees.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+// Rotations for when U and V are interleaved. These functions take one input
+// pointer and split the data into two buffers while rotating them.
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 degree rotations are based on transposes: doing a transpose
+// while reversing the read/write order results in a rotation by +/- 90 degrees.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_ROTATE_H_
+#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
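For reference, a hedged sketch of calling the updated I420Rotate signature with kRotate90. The helper name, tightly packed plane strides and even frame dimensions are assumptions; the dimension swap for 90/270 degree rotations is the only point being illustrated.

#include "libyuv/rotate.h"

// Rotate a tightly packed I420 frame 90 degrees clockwise.
// After a 90 (or 270) degree rotation the destination is
// src_height x src_width, so destination strides derive from src_height.
int RotateI420By90(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   int src_width, int src_height,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_height;  // Dimensions swap.
  return libyuv::I420Rotate(src_y, src_width,
                            src_u, src_width / 2,
                            src_v, src_width / 2,
                            dst_y, dst_width,
                            dst_u, dst_width / 2,
                            dst_v, dst_width / 2,
                            src_width, src_height, libyuv::kRotate90);
}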
diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h
new file mode 100644
index 00000000..a2781df3
--- /dev/null
+++ b/files/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame.
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h
new file mode 100644
index 00000000..4814f254
--- /dev/null
+++ b/files/include/libyuv/row.h
@@ -0,0 +1,731 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Remove kMaxStride
+#define kMaxStride (2880 * 4)
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions.
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTORGBAROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOARGBROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROWUV_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_SETROW_X86
+#define HAS_SPLITUV_SSE2
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBINTERPOLATEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
+#endif
+
+// The following are Windows only:
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ABGRTOARGBROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_RGBATOARGBROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_COPYROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORROWUV_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUV_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// TODO(fbarchard): Hook these up to calling functions.
+#define HAS_ABGRTOARGBROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
+#define HAS_BGRATOARGBROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGBATOARGBROW_NEON
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) uint32 uvec32[4];
+#elif defined(__GNUC__)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+#else
+#define SIMD_ALIGNED(var) var
+typedef int8 vec8[16];
+typedef uint8 uvec8[16];
+typedef int16 vec16[8];
+typedef uint16 uvec16[8];
+typedef int32 vec32[4];
+typedef uint32 uvec32[4];
+#endif
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+void I422ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToBGRARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToABGRRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGBARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRAWRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width);
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+
+void SetRow8_X86(uint8* dst, uint32 v32, int count);
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow8_NEON(uint8* dst, uint32 v32, int count);
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow8_C(uint8* dst, uint32 v32, int count);
+void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
+
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
+
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_C(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_C(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+void I422ToRGB24Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb24_buf,
+ int width);
+void I422ToRAWRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* raw_buf,
+ int width);
+
+void YToARGBRow_C(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
+
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void I444ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width);
+
+void I411ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+
+void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width);
+
+void I422ToBGRARow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width);
+
+void I422ToABGRRow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width);
+
+void I422ToRGBARow_Any_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width);
+
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* argb_buf,
+ int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void I422ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToBGRARow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void I422ToRAWRow_Any_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width);
+
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+
+// Used for blur.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction);
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
+
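row.h pairs each kernel with a HAS_* macro and the IS_ALIGNED helper so callers can pick an implementation per row. A hedged sketch of that dispatch pattern, using only the ARGBToYRow declarations above; the wrapper name and the width-multiple-of-16 restriction are assumptions, and the real library additionally consults cpu_id.h at run time.

#include "libyuv/row.h"

// Convert a packed ARGB image to a Y plane one row at a time, using the
// SSSE3 kernel when it was compiled in and pointers/strides are 16-byte
// aligned, otherwise the portable C kernel.
static void ARGBToYPlane(const uint8* src_argb, int src_stride_argb,
                         uint8* dst_y, int dst_stride_y,
                         int width, int height) {
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      libyuv::ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
  // Width is assumed to be a multiple of 16 here, so the _Any/_Unaligned
  // variants are not needed for this sketch.
  if (IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
    ARGBToYRow = libyuv::ARGBToYRow_SSSE3;
  }
#endif
  for (int y = 0; y < height; ++y) {
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
}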
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 8433908b..18098798 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,20 +8,31 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_H_
+#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_H_
#include "libyuv/basic_types.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
// Supported filtering
enum FilterMode {
- kFilterNone = 0, // Point sample; Fastest
+ kFilterNone = 0, // Point sample; Fastest.
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
- kFilterBox = 2 // Highest quality
+ kFilterBox = 2 // Highest quality.
};
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -32,6 +43,7 @@ enum FilterMode {
// quality image, at further expense of speed.
// Returns 0 if successful.
+LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -42,15 +54,8 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
FilterMode filtering);
-// Legacy API
-// If dst_height_offset is non-zero, the image is offset by that many pixels
-// and stretched to (dst_height - dst_height_offset * 2) pixels high,
-// instead of dst_height.
-int Scale(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_height_offset,
- bool interpolate);
-
-// Same, but specified src terms of each plane location and stride.
+// Legacy API. Deprecated.
+LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
@@ -59,9 +64,19 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_width, int dst_height,
bool interpolate);
-// For testing, allow disabling of optimizations.
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
void SetUseReferenceImpl(bool use);
-} // namespace libyuv
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
-#endif // INCLUDE_LIBYUV_SCALE_H_
+#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
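A hedged sketch of the preferred (non-deprecated) I420Scale entry point, halving a frame with box filtering. The helper name, tightly packed strides and even dimensions are assumptions.

#include "libyuv/scale.h"

// Downscale a tightly packed I420 frame to half size with box filtering.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              int src_width, int src_height,
              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_width / 2;
  const int dst_height = src_height / 2;
  return libyuv::I420Scale(src_y, src_width,
                           src_u, src_width / 2,
                           src_v, src_width / 2,
                           src_width, src_height,
                           dst_y, dst_width,
                           dst_u, dst_width / 2,
                           dst_v, dst_width / 2,
                           dst_width, dst_height,
                           libyuv::kFilterBox);
}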
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
new file mode 100644
index 00000000..1af0e1dc
--- /dev/null
+++ b/files/include/libyuv/scale_argb.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
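Since ARGB is a single packed plane, the new ARGBScale entry point takes one pointer/stride pair per image. A short sketch; the helper name, the 160x120 output size and the width * 4 strides are illustrative assumptions.

#include "libyuv/scale_argb.h"

// Scale a packed ARGB image down to a 160x120 thumbnail with bilinear
// filtering. Strides are width * 4 bytes for tightly packed rows.
int MakeArgbThumbnail(const uint8* src_argb, int src_width, int src_height,
                      uint8* dst_argb /* at least 160 * 120 * 4 bytes */) {
  return libyuv::ARGBScale(src_argb, src_width * 4,
                           src_width, src_height,
                           dst_argb, 160 * 4,
                           160, 120,
                           libyuv::kFilterBilinear);
}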
diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h
new file mode 100644
index 00000000..e782ae18
--- /dev/null
+++ b/files/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 397
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
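The new version define gives downstream code something to gate on; a trivial compile-time check (the feature list in the message is illustrative):

#include "libyuv/version.h"

#if LIBYUV_VERSION < 397
#error "libyuv r397 or later required (e.g. for ARGBRotate, ARGBScale, MJPGToARGB)."
#endif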
diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h
new file mode 100644
index 00000000..5d812c98
--- /dev/null
+++ b/files/include/libyuv/video_common.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro; otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#define FOURCC(a, b, c, d) ( \
+ (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+enum FourCC {
+ // Canonical fourcc codes used in our code.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_V210 = FOURCC('V', '2', '1', '0'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // bgr565.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // abgr1555.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444.
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+ // Next four are Bayer RGB formats. The four characters define the order of
+ // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // Aliases for canonical fourcc codes, replaced with their canonical
+ // equivalents by CanonicalFourCC().
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+
+ // Match any fourcc.
+ FOURCC_ANY = 0xFFFFFFFF,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_V210 = 22, // 128 / 6 actually.
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ // Next four are Bayer RGB formats. The four characters define the order of
+ // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+
+ // Aliases for canonical fourcc codes, replaced with their canonical
+ // equivalents by CanonicalFourCC().
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
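To show how the FOURCC macro, the alias list and CanonicalFourCC fit together, a hedged sketch; the helper names and the 4:2:0 frame-size arithmetic are assumptions layered on the FOURCC_BPP_* table above.

#include <assert.h>
#include <stddef.h>
#include "libyuv/video_common.h"

// FOURCC_BPP_* values are bits per pixel, so bytes = width * height * bpp / 8.
size_t I420FrameSize(int width, int height) {
  return static_cast<size_t>(width) * height * libyuv::FOURCC_BPP_I420 / 8;
}

void FourCCExample() {
  uint32 fourcc = FOURCC('Y', 'U', 'Y', 'V');  // e.g. reported by a capture driver.
  assert(libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_YUY2);
  assert(I420FrameSize(640, 480) == 640 * 480 * 3 / 2);
}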
diff --git a/files/libyuv.gyp b/files/libyuv.gyp
index d5abab73..18137538 100644
--- a/files/libyuv.gyp
+++ b/files/libyuv.gyp
@@ -1,4 +1,4 @@
-# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
@@ -7,60 +7,85 @@
# be found in the AUTHORS file in the root of the source tree.
{
+ 'variables': {
+ 'use_system_libjpeg%': 0,
+ },
'targets': [
{
'target_name': 'libyuv',
'type': 'static_library',
+ # 'type': 'shared_library',
+ 'conditions': [
+ ['use_system_libjpeg==0', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+ ],
+ }, {
+ 'link_settings': {
+ 'libraries': [
+ '-ljpeg',
+ ],
+ },
+ }],
+ ],
+ 'defines': [
+ 'HAVE_JPEG',
+ # 'LIBYUV_BUILDING_SHARED_LIBRARY',
+ ],
'include_dirs': [
- 'common',
'include',
+ '.',
],
'direct_dependent_settings': {
'include_dirs': [
- 'common',
'include',
+ '.',
],
},
'sources': [
- # includes
- 'include/convert.h',
- 'include/general.h',
- 'include/scale.h',
- 'include/planar_functions.h',
+ # includes.
+ 'include/libyuv.h',
+ 'include/libyuv/basic_types.h',
+ 'include/libyuv/compare.h',
+ 'include/libyuv/convert.h',
+ 'include/libyuv/convert_argb.h',
+ 'include/libyuv/convert_from.h',
+ 'include/libyuv/cpu_id.h',
+ 'include/libyuv/format_conversion.h',
+ 'include/libyuv/mjpeg_decoder.h',
+ 'include/libyuv/planar_functions.h',
+ 'include/libyuv/rotate.h',
+ 'include/libyuv/rotate_argb.h',
+ 'include/libyuv/row.h',
+ 'include/libyuv/scale.h',
+ 'include/libyuv/scale_argb.h',
+ 'include/libyuv/version.h',
+ 'include/libyuv/video_common.h',
- # headers
- 'common/basic_types.h',
- 'common/common.h',
- 'common/constructor_magic.h',
- 'source/cpu_id.h',
- 'source/rotate.h'
- 'source/row.h',
- 'source/video_common.h',
-
- # sources
+ # sources.
+ 'source/compare.cc',
+ 'source/compare_neon.cc',
'source/convert.cc',
+ 'source/convert_argb.cc',
+ 'source/convert_from.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
- 'source/general.cc',
+ 'source/mjpeg_decoder.cc',
'source/planar_functions.cc',
'source/rotate.cc',
- 'source/row_table.cc',
+ 'source/rotate_argb.cc',
+ 'source/rotate_neon.cc',
+ 'source/row_common.cc',
+ 'source/row_neon.cc',
+ 'source/row_posix.cc',
+ 'source/row_win.cc',
'source/scale.cc',
+ 'source/scale_neon.cc',
+ 'source/scale_argb.cc',
'source/video_common.cc',
],
- 'conditions': [
- ['OS=="win"', {
- 'sources': [
- 'source/row_win.cc',
- ],
- },{ # else
- 'sources': [
- 'source/row_posix.cc',
- ],
- }],
- ]
},
- ], # targets
+ ], # targets.
}
# Local Variables:
diff --git a/files/libyuv_test.gyp b/files/libyuv_test.gyp
new file mode 100755
index 00000000..27cec8f4
--- /dev/null
+++ b/files/libyuv_test.gyp
@@ -0,0 +1,74 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'targets': [
+ {
+ 'target_name': 'libyuv_unittest',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ # The tests are based on gtest
+ 'testing/gtest.gyp:gtest',
+ 'testing/gtest.gyp:gtest_main',
+ ],
+ 'defines': [
+ 'LIBYUV_SVNREVISION="<!(svnversion -n)"',
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ ],
+ 'sources': [
+ # headers
+ 'unit_test/unit_test.h',
+
+ # sources
+ 'unit_test/compare_test.cc',
+ 'unit_test/cpu_test.cc',
+ 'unit_test/planar_test.cc',
+ 'unit_test/rotate_argb_test.cc',
+ 'unit_test/rotate_test.cc',
+ 'unit_test/scale_argb_test.cc',
+ 'unit_test/scale_test.cc',
+ 'unit_test/unit_test.cc',
+ 'unit_test/version_test.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+
+ {
+ 'target_name': 'compare',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/compare.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+
+ ], # targets
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/files/source/compare.cc b/files/source/compare.cc
new file mode 100644
index 00000000..bf4a7dae
--- /dev/null
+++ b/files/source/compare.cc
@@ -0,0 +1,571 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Hash seed of 5381 is recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ for (int i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
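// Editor's sketch (not part of this patch): the SSE4.1 path below relies on
// the identity hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)) for i = 0..15,
// evaluated modulo 2^32; kHash16x33 and kHashMul0..3 hold those powers of 33.
// A scalar version of the same 16-byte step, with a hypothetical name:
static uint32 HashDjb2_Block16_Sketch(const uint8* src, uint32 hash) {
  uint32 pow33[17];  // pow33[k] = 33^k mod 2^32.
  pow33[0] = 1;
  for (int k = 1; k <= 16; ++k) {
    pow33[k] = pow33[k - 1] * 33u;
  }
  uint32 result = hash * pow33[16];  // Same role as kHash16x33 below.
  for (int i = 0; i < 16; ++i) {
    result += static_cast<uint32>(src[i]) * pow33[15 - i];  // kHashMul0..3.
  }
  return result;  // Equals running HashDjb2_C over these 16 bytes.
}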
+
+// This module is for Visual C x86
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_HASHDJB2_SSE41
+static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static const uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ align 16
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 14 // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 1
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+// GCC 4.2 on OSX has a link error when passing static or const to inline asm.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+#define HAS_HASHDJB2_SSE41
+CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+CONST uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+CONST uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+CONST uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+CONST uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile (
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "sub $0x10,%1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+ return hash;
+}
+#endif // HAS_HASHDJB2_SSE41
+
+// A hash seed of 5381 is recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+
+ const int kBlockSize = 1 << 15; // 32768;
+ while (count >= static_cast<uint64>(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ int remainder = static_cast<int>(count) & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = static_cast<int>(count) & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
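// Editor's sketch (not part of this patch): typical use of the public entry
// point above, hashing a whole frame with the recommended seed of 5381.
// The helper name and buffer arguments here are hypothetical.
static uint32 HashFrame_Sketch(const uint8* frame, int width, int height) {
  return HashDjb2(frame, static_cast<uint64>(width) * height, 5381u);
}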
+
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SUMSQUAREERROR_NEON
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked) __declspec(align(16))
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+ int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ align 16
+ wloop:
+ movdqa xmm1, [eax]
+ movdqa xmm2, [eax + edx]
+ lea eax, [eax + 16]
+ sub ecx, 16
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ jg wloop
+
+ pshufd xmm1, xmm0, 0EEh
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 01h
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 sse;
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movdqa (%0,%1,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+ return sse;
+}
+#endif
+
+static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 sse = 0u;
+ for (int i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += static_cast<uint32>(diff * diff);
+ }
+ return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#elif defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note: only used for multiples of 16, so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+  // 32K values will fit a 32-bit int return value from SumSquareError.
+  // After each block of 32K pixels, accumulate into a 64-bit int.
+ const int kBlockSize = 1 << 15; // 32768;
+ uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ int remainder = count & (kBlockSize - 1) & ~15;
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 15;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
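// Editor's note (not part of this patch): why the 32K block size above is safe
// for a uint32 accumulator -- the worst case per block is 255^2 * 32768 =
// 2130739200, which still fits in 32 bits.
static const uint64 kWorstCaseBlockSse_Sketch =
    static_cast<uint64>(255) * 255 * 32768;  // 2130739200 < 2^32 - 1.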
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#elif defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+ IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+
+ uint64 sse = 0;
+ for (int h = 0; h < height; ++h) {
+ sse += SumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
+ double mse = static_cast<double>(count) / static_cast<double>(sse);
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
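// Editor's sketch (not part of this patch): a worked example of the formula
// above. If every pixel differs by exactly 1, SSE equals the sample count, so
// MSE is 1 and the PSNR reduces to 10 * log10(255 * 255), about 48.13 dB.
static double PsnrForUnitDiff_Sketch() {
  const uint64 kSamples = 16 * 16;
  const uint64 kSse = kSamples;  // Every pixel differs by 1, so diff^2 == 1.
  return SumSquareErrorToPsnr(kSse, kSamples);  // ~48.13
}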
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+ src_b, stride_b,
+ width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+ src_y_b, stride_y_b,
+ width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2)
+static const int64 cc2 = 239708; // (64^2*(.03*255)^2)
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ const int64 count = 64;
+ // Scale the constants by the number of pixels.
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
+
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0)
+ return DBL_MAX;
+ return ssim_n * 1.0 / ssim_d;
+}
+
+// We are using an 8x8 moving window whose starting locations step along a
+// 4x4 pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ int samples = 0;
+ double ssim_total = 0;
+
+ double (*Ssim8x8)(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b);
+
+ Ssim8x8 = Ssim8x8_C;
+
+ // Sample points start at each 4x4 grid location.
+ for (int i = 0; i < height - 8; i += 4) {
+ for (int j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
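// Editor's sketch (not part of this patch): a quick sanity check of the SSIM
// code above. When the two inputs are identical, the numerator and denominator
// in Ssim8x8_C are equal for every window, so CalcFrameSsim returns 1.0.
static double SsimOfIdenticalPlanes_Sketch() {
  const int kWidth = 32;
  const int kHeight = 32;
  uint8 plane[kWidth * kHeight];
  for (int i = 0; i < kWidth * kHeight; ++i) {
    plane[i] = static_cast<uint8>(i & 255);
  }
  return CalcFrameSsim(plane, kWidth, plane, kWidth, kWidth, kHeight);  // 1.0
}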
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+ src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
new file mode 100644
index 00000000..d8b375b8
--- /dev/null
+++ b/files/source/compare_neon.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n"
+ "vld1.u8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h
index 9a328649..ef3ebf36 100644
--- a/files/source/conversion_tables.h
+++ b/files/source/conversion_tables.h
@@ -18,7 +18,10 @@
#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
/******************************************************************************
* YUV TO RGB approximation
@@ -197,7 +200,10 @@ namespace libyuv {
Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251),
Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
-} // namespace libyuv
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
#endif
diff --git a/files/source/convert.cc b/files/source/convert.cc
index 8154dcb7..0882c92b 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,174 +10,131 @@
#include "libyuv/convert.h"
-#include "conversion_tables.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "row.h"
-
-//#define SCALEOPT //Currently for windows only. June 2010
-
-#ifdef SCALEOPT
-#include <emmintrin.h>
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-static inline uint8 Clip(int32 val) {
- if (val < 0) {
- return (uint8) 0;
- } else if (val > 255){
- return (uint8) 255;
- }
- return (uint8) val;
-}
-
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+// Copy I420 with optional flipping
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
- // RGB orientation - bottom up
- // TODO(fbarchard): support inversion
- uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame;
- uint8* out2 = out - dst_stride_frame;
- int h, w;
- int tmp_r, tmp_g, tmp_b;
- const uint8 *y1, *y2 ,*u, *v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = Clip(tmp_b);
- out[1] = Clip(tmp_g);
- out[2] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[3] = Clip(tmp_b);
- out[4] = Clip(tmp_g);
- out[5] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = Clip(tmp_b);
- out2[1] = Clip(tmp_g);
- out2[2] = Clip(tmp_r);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[3] = Clip(tmp_b);
- out2[4] = Clip(tmp_g);
- out2[5] = Clip(tmp_r);
-
- out += 6;
- out2 += 6;
- y1 += 2;
- y2 += 2;
- u++;
- v++;
- }
- y1 += src_stride_y + src_stride_y - width;
- y2 += src_stride_y + src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= dst_stride_frame * 3;
- out2 -= dst_stride_frame * 3;
- } // end height for
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
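// Editor's sketch (not part of this patch): the negative-height convention
// used throughout this file. Passing -height asks the converter to flip the
// image vertically; the tightly packed strides here are illustrative only.
static int I420VerticalFlip_Sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_y,
                                   uint8* dst_u, uint8* dst_v,
                                   int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return I420Copy(src_y, width, src_u, halfwidth, src_v, halfwidth,
                  dst_y, width, dst_u, halfwidth, dst_v, halfwidth,
                  width, -height);  // Negative height flips vertically.
}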
-// Little Endian...
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
- return -1;
+// Move to row_win etc.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_HALFROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pavgb xmm0, [eax + edx]
+ sub ecx, 16
+ movdqa [eax + edi], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ ret
}
+}
- // RGB orientation - bottom up
- uint8* out = dst_frame + dst_stride_frame * (height - 1);
- uint8* out2 = out - dst_stride_frame;
- int tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2, *u, *v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--) {
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++) {
- // Vertical and horizontal sub-sampling
- // Convert to RGB888 and re-scale to 4 bits
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] =(uint8)((Clip(tmp_g) & 0xf0) + (Clip(tmp_b) >> 4));
- out[1] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out2[1] = (uint8) (0xf0 + (Clip(tmp_r) >> 4));
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4));
- out2[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4));
-
- out += 4;
- out2 += 4;
- y1 += 2;
- y2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= (dst_stride_frame + width) * 2;
- out2 -= (dst_stride_frame + width) * 2;
- } // end height for
- return 0;
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_HALFROW_SSE2
+static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(pix) // %2
+ : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0"
+#endif
+);
}
+#endif
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
-
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -188,717 +145,1937 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame;
-
- int tmp_r, tmp_g, tmp_b;
- const uint8* y1,* y2, * u, * v;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 565
-
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= 2 * dst_stride_frame + width;
- out2 -= 2 * dst_stride_frame + width;
+ int halfwidth = (width + 1) >> 1;
+ void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) = HalfRow_C;
+#if defined(HAS_HALFROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ HalfRow = HalfRow_SSE2;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ HalfRow(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ HalfRow(src_v, 0, dst_v, halfwidth);
}
return 0;
}
+// Blends 32x2 pixels to 16x1
+// source in scale.cc
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+#endif
+void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
- uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame ;
- int32 tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2, *u, *v;
- int h, w;
-
- y1 = src_y;
- y2 = y1 + src_stride_y;
- u = src_u;
- v = src_v;
-
- for (h = ((height + 1) >> 1); h > 0; h--){
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++){
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 555
- // 3. Add 1 for alpha value
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8);
- out[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8);
- out[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8);
- out2[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8);
- out2[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) +
- ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3));
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- u++;
- v++;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out -= 2 * dst_stride_frame + width;
- out2 -= 2 * dst_stride_frame + width;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C;
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Int_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Int_SSE2;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_v, 0, dst_v, halfwidth);
}
return 0;
}
+// Use bilinear scaling to upsample chroma.
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr);
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
- const uint8* in1 = src_y;
- const uint8* in2 = src_y + src_stride_y;
-
- uint8* out1 = dst_frame;
- uint8* out2 = dst_frame + dst_stride_frame;
-
- // YUY2 - Macro-pixel = 2 image pixels
- // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
-#ifndef SCALEOPT
- for (int i = 0; i < ((height + 1) >> 1); i++){
- for (int j = 0; j < ((width + 1) >> 1); j++){
- out1[0] = in1[0];
- out1[1] = *src_u;
- out1[2] = in1[1];
- out1[3] = *src_v;
-
- out2[0] = in2[0];
- out2[1] = *src_u;
- out2[2] = in2[1];
- out2[3] = *src_v;
- out1 += 4;
- out2 += 4;
- src_u++;
- src_v++;
- in1 += 2;
- in2 += 2;
- }
- in1 += 2 * src_stride_y - width;
- in2 += 2 * src_stride_y - width;
- src_u += src_stride_u - ((width + 1) >> 1);
- src_v += src_stride_v - ((width + 1) >> 1);
- out1 += dst_stride_frame + dst_stride_frame - 2 * width;
- out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
-#else
- for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) {
- int32 width__ = (width >> 4);
- _asm
- {
- ;pusha
- mov eax, DWORD PTR [in1] ;1939.33
- mov ecx, DWORD PTR [in2] ;1939.33
- mov ebx, DWORD PTR [src_u] ;1939.33
- mov edx, DWORD PTR [src_v] ;1939.33
- loop0:
- movq xmm6, QWORD PTR [ebx] ;src_u
- movq xmm0, QWORD PTR [edx] ;src_v
- punpcklbw xmm6, xmm0 ;src_u, src_v mix
- ;movdqa xmm1, xmm6
- ;movdqa xmm2, xmm6
- ;movdqa xmm4, xmm6
-
- movdqu xmm3, XMMWORD PTR [eax] ;in1
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm6 ;in1, src_u, in1, src_v
- mov esi, DWORD PTR [out1]
- movdqu XMMWORD PTR [esi], xmm1 ;write to out1
-
- movdqu xmm5, XMMWORD PTR [ecx] ;in2
- movdqa xmm2, xmm5
- punpcklbw xmm2, xmm6 ;in2, src_u, in2, src_v
- mov edi, DWORD PTR [out2]
- movdqu XMMWORD PTR [edi], xmm2 ;write to out2
-
- punpckhbw xmm3, xmm6 ;in1, src_u, in1, src_v again
- movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again
- add esi, 32
- mov DWORD PTR [out1], esi
-
- punpckhbw xmm5, xmm6 ;src_u, in2, src_v again
- movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again
- add edi, 32
- mov DWORD PTR [out2], edi
-
- add ebx, 8
- add edx, 8
- add eax, 16
- add ecx, 16
-
- mov esi, DWORD PTR [width__]
- sub esi, 1
- mov DWORD PTR [width__], esi
- jg loop0
-
- mov DWORD PTR [in1], eax ;1939.33
- mov DWORD PTR [in2], ecx ;1939.33
- mov DWORD PTR [src_u], ebx ;1939.33
- mov DWORD PTR [src_v], edx ;1939.33
-
- ;popa
- emms
- }
- in1 += 2 * src_stride_y - width;
- in2 += 2 * src_stride_y - width;
- out1 += dst_stride_frame + dst_stride_frame - 2 * width;
- out2 += dst_stride_frame + dst_stride_frame - 2 * width;
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane.
+ ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
+ halfwidth, halfheight, // to 1/2 width, 1/2 height
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Resample V plane.
+ ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height
+ halfwidth, halfheight, // to 1/2 width, 1/2 height
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+ uint8* dst, int dst_stride_frame,
+ int width, int height) {
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#elif defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) &&
+ IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ }
+#endif
+
+ // Copy plane
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride_frame, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+// The UV plane is half width but holds two values per pixel, so
+// src_stride_m420 applies to it as well as to the two Y planes.
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0, int src_stride_y1,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_uv ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUV_C;
+#if defined(HAS_SPLITUV_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+ SplitUV = SplitUV_NEON;
+ }
+#elif defined(HAS_SPLITUV_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ SplitUV = SplitUV_SSE2;
}
#endif
+
+ if (dst_y) {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+
+ int halfheight = (height + 1) >> 1;
+ for (int y = 0; y < halfheight; ++y) {
+ // Copy a row of UV.
+ SplitUV(src_uv, dst_u, dst_v, halfwidth);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
return 0;
}
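// Editor's sketch (not part of this patch): the deinterleave that SplitUV
// performs for NV12/M420 above -- one row of interleaved UVUV... split into
// separate U and V rows. The real SplitUV_C lives in the row_* sources; this
// only illustrates the idea.
static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // U comes first in NV12.
    dst_v[x] = src_uv[2 * x + 1];
  }
}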
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
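// Editor's sketch (not part of this patch): how the stride arguments above map
// onto the M420 layout. Each group of three buffer rows holds two Y rows and
// then one interleaved UV row, which is why the call passes src_stride_m420,
// src_stride_m420 * 2 and src_stride_m420 * 3. Helper names are hypothetical.
static const uint8* M420YRow_Sketch(const uint8* src_m420, int stride, int y) {
  // Y row y sits in group y / 2; within a group, rows 0 and 1 are Y.
  return src_m420 + (y / 2) * stride * 3 + (y & 1) * stride;
}
static const uint8* M420UVRow_Sketch(const uint8* src_m420, int stride,
                                     int uv_row) {
  // The UV row shared by Y rows 2*uv_row and 2*uv_row + 1 is the third row.
  return src_m420 + uv_row * stride * 3 + stride * 2;
}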
+
+// Convert Q420 to I420.
+// Format is rows of YY/YUYV
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_yuy2 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
return -1;
}
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // CopyRow copies the rows of plain Y in Q420 into the Y plane of I420.
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
- int i = 0;
- const uint8* y1 = src_y;
- const uint8* y2 = y1 + src_stride_y;
- const uint8* u = src_u;
- const uint8* v = src_v;
-
- uint8* out1 = dst_frame;
- uint8* out2 = dst_frame + dst_stride_frame;
-
- // Macro-pixel = 2 image pixels
- // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5.....
-
-#ifndef SCALEOPT
- for (; i < ((height + 1) >> 1); i++) {
- for (int j = 0; j < ((width + 1) >> 1); j++) {
- out1[0] = *u;
- out1[1] = y1[0];
- out1[2] = *v;
- out1[3] = y1[1];
-
- out2[0] = *u;
- out2[1] = y2[0];
- out2[2] = *v;
- out2[3] = y2[1];
- out1 += 4;
- out2 += 4;
- u++;
- v++;
- y1 += 2;
- y2 += 2;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- u += src_stride_u - ((width + 1) >> 1);
- v += src_stride_v - ((width + 1) >> 1);
- out1 += 2 * (dst_stride_frame - width);
- out2 += 2 * (dst_stride_frame - width);
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
}
-#else
- for (; i < (height >> 1);i++) {
- int32 width__ = (width >> 4);
- _asm
- {
- ;pusha
- mov eax, DWORD PTR [in1] ;1939.33
- mov ecx, DWORD PTR [in2] ;1939.33
- mov ebx, DWORD PTR [src_u] ;1939.33
- mov edx, DWORD PTR [src_v] ;1939.33
-loop0:
- movq xmm6, QWORD PTR [ebx] ;src_u
- movq xmm0, QWORD PTR [edx] ;src_v
- punpcklbw xmm6, xmm0 ;src_u, src_v mix
- movdqa xmm1, xmm6
- movdqa xmm2, xmm6
- movdqa xmm4, xmm6
-
- movdqu xmm3, XMMWORD PTR [eax] ;in1
- punpcklbw xmm1, xmm3 ;src_u, in1, src_v
- mov esi, DWORD PTR [out1]
- movdqu XMMWORD PTR [esi], xmm1 ;write to out1
-
- movdqu xmm5, XMMWORD PTR [ecx] ;in2
- punpcklbw xmm2, xmm5 ;src_u, in2, src_v
- mov edi, DWORD PTR [out2]
- movdqu XMMWORD PTR [edi], xmm2 ;write to out2
-
- punpckhbw xmm4, xmm3 ;src_u, in1, src_v again
- movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again
- add esi, 32
- mov DWORD PTR [out1], esi
-
- punpckhbw xmm6, xmm5 ;src_u, in2, src_v again
- movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again
- add edi, 32
- mov DWORD PTR [out2], edi
-
- add ebx, 8
- add edx, 8
- add eax, 16
- add ecx, 16
-
- mov esi, DWORD PTR [width__]
- sub esi, 1
- mov DWORD PTR [width__], esi
- jg loop0
-
- mov DWORD PTR [in1], eax ;1939.33
- mov DWORD PTR [in2], ecx ;1939.33
- mov DWORD PTR [src_u], ebx ;1939.33
- mov DWORD PTR [src_v], edx ;1939.33
-
- ;popa
- emms
- }
- in1 += width;
- in2 += width;
- out1 += 2 * (dst_stride_frame - width);
- out2 += 2 * (dst_stride_frame - width);
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
}
#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ CopyRow(src_y, dst_y, width);
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ }
return 0;
}
+// Test whether over-reading the source is safe.
+// TODO(fbarchard): Find a more efficient solution to safely handle odd sizes.
+// Macros to control read policy, from slowest to fastest:
+// READSAFE_NEVER - disables read ahead on systems with strict memory reads
+// READSAFE_ODDHEIGHT - last row of odd height done with C.
+// This policy assumes that the caller handles the last row of an odd height
+// image using C.
+// READSAFE_PAGE - enable read ahead within same page.
+// A page is 4096 bytes. When reading ahead, if the last pixel is near the
+//   end of the page and a read spans into the next page, a memory
+// exception can occur if that page has not been allocated, or is a guard
+// page. This setting ensures the overread is within the same page.
+// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
+// or where buffers are padded by 64 bytes.
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3) || \
+ defined(HAS_RGB24TOARGBROW_SSSE3) || \
+ defined(HAS_RAWTOARGBROW_SSSE3) || \
+ defined(HAS_RGB565TOARGBROW_SSE2) || \
+ defined(HAS_ARGB1555TOARGBROW_SSE2) || \
+ defined(HAS_ARGB4444TOARGBROW_SSE2)
+
+#define READSAFE_ODDHEIGHT
+
+static bool TestReadSafe(const uint8* src_yuy2, int src_stride_yuy2,
+ int width, int height, int bpp, int overread) {
+ if (width > kMaxStride) {
+ return false;
+ }
+#if defined(READSAFE_ALWAYS)
+ return true;
+#elif defined(READSAFE_NEVER)
+ return false;
+#elif defined(READSAFE_ODDHEIGHT)
+ if (!(width & 15) ||
+ (src_stride_yuy2 >= 0 && (height & 1) && width * bpp >= overread)) {
+ return true;
+ }
+ return false;
+#elif defined(READSAFE_PAGE)
+ if (src_stride_yuy2 >= 0) {
+ src_yuy2 += (height - 1) * src_stride_yuy2;
+ }
+ uintptr_t last_adr = (uintptr_t)(src_yuy2) + width * bpp - 1;
+ uintptr_t last_read_adr = last_adr + overread - 1;
+ if (((last_adr ^ last_read_adr) & ~4095) == 0) {
+ return true;
+ }
+ return false;
+#endif
+}
+#endif
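// Editor's sketch (not part of this patch): the page test that READSAFE_PAGE
// performs above, pulled out on its own. An overread is treated as safe only
// when the last byte actually needed and the last byte speculatively read fall
// within the same 4096-byte page.
static bool OverreadStaysInPage_Sketch(uintptr_t last_needed_addr,
                                       int overread) {
  uintptr_t last_read_addr = last_needed_addr + overread - 1;
  return ((last_needed_addr ^ last_read_addr) &
          ~static_cast<uintptr_t>(4095)) == 0;
}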
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_y == NULL || src_uv == NULL || dst_frame == NULL) {
- return -1;
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUVRow = YUY2ToUVRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
}
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
- // Bi-Planar: Y plane followed by an interlaced U and V plane
- const uint8* interlacedSrc = src_uv;
- uint16* out = (uint16*)(src_y) + dst_stride_frame * (height - 1);
- uint16* out2 = out - dst_stride_frame;
- int32 tmp_r, tmp_g, tmp_b;
- const uint8 *y1,*y2;
- y1 = src_y;
- y2 = y1 + src_stride_y;
- int h, w;
-
- for (h = ((height + 1) >> 1); h > 0; h--) {
- // 2 rows at a time, 2 y's at a time
- for (w = 0; w < ((width + 1) >> 1); w++) {
- // Vertical and horizontal sub-sampling
- // 1. Convert to RGB888
- // 2. Shift to adequate location (in the 16 bit word) - RGB 565
-
- tmp_r = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b ) >> 3);
-
- tmp_r = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- tmp_r = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]]
- + 128) >> 8);
- tmp_g = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]]
- + mapVcg[interlacedSrc[1]] + 128) >> 8);
- tmp_b = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8);
- out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g)
- & 0xfc) << 3) + (Clip(tmp_b) >> 3);
-
- y1 += 2;
- y2 += 2;
- out += 2;
- out2 += 2;
- interlacedSrc += 2;
- }
- y1 += 2 * src_stride_y - width;
- y2 += 2 * src_stride_y - width;
- interlacedSrc += src_stride_uv - ((width + 1) >> 1);
- out -= 3 * dst_stride_frame + dst_stride_frame - width;
- out2 -= 3 * dst_stride_frame + dst_stride_frame - width;
+ for (int y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
}
return 0;
}
-// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height) {
- if (src_frame == NULL || dst_frame == NULL) {
- return -1;
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
}
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUVRow = UYVYToUVRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
- int i, j, offset;
- uint8* outFrame = dst_frame;
- const uint8* inFrame = src_frame;
+ for (int y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
- outFrame += dst_stride_frame * (height - 1) * 4;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- offset = j * 4;
- outFrame[0 + offset] = inFrame[0];
- outFrame[1 + offset] = inFrame[1];
- outFrame[2 + offset] = inFrame[2];
- outFrame[3 + offset] = 0xff;
- inFrame += 3;
+// Little endian: Visual C x86/x64/ARM, or GCC targets reporting little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define READWORD(p) (*reinterpret_cast<const uint32*>(p))
+#else
+static inline uint32 READWORD(const uint8* p) {
+ return static_cast<uint32>(p[0]) |
+ (static_cast<uint32>(p[1]) << 8) |
+ (static_cast<uint32>(p[2]) << 16) |
+ (static_cast<uint32>(p[3]) << 24);
+}
+#endif
+
+// Width must be a multiple of 6 pixels; over-converts to handle any remainder.
+// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
+static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
+ for (int x = 0; x < width; x += 6) {
+ uint32 w = READWORD(src_v210 + 0);
+ dst_uyvy[0] = (w >> 2) & 0xff;
+ dst_uyvy[1] = (w >> 12) & 0xff;
+ dst_uyvy[2] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 4);
+ dst_uyvy[3] = (w >> 2) & 0xff;
+ dst_uyvy[4] = (w >> 12) & 0xff;
+ dst_uyvy[5] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 8);
+ dst_uyvy[6] = (w >> 2) & 0xff;
+ dst_uyvy[7] = (w >> 12) & 0xff;
+ dst_uyvy[8] = (w >> 22) & 0xff;
+
+ w = READWORD(src_v210 + 12);
+ dst_uyvy[9] = (w >> 2) & 0xff;
+ dst_uyvy[10] = (w >> 12) & 0xff;
+ dst_uyvy[11] = (w >> 22) & 0xff;
+
+ src_v210 += 16;
+ dst_uyvy += 12;
+ }
+}
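Each little-endian 32-bit word of V210 packs three 10-bit components in bits 0-9, 10-19 and 20-29, and the row function above keeps only the top 8 bits of each component via the >> 2, >> 12 and >> 22 shifts. A small self-contained check of that arithmetic, illustrative only and not library code:

// Illustrative check of the V210 unpacking shifts used by V210ToUYVYRow_C.
#include <cassert>
#include <cstdint>

int main() {
  // Pack three arbitrary 10-bit components the way a V210 word lays them out.
  const uint32_t c0 = 0x200, c1 = 0x3ff, c2 = 0x001;
  const uint32_t w = c0 | (c1 << 10) | (c2 << 20);
  // The row function keeps the top 8 of each component's 10 bits.
  assert(((w >> 2) & 0xff) == (c0 >> 2));
  assert(((w >> 12) & 0xff) == (c1 >> 2));
  assert(((w >> 22) & 0xff) == (c2 >> 2));
  return 0;
}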
+
+// Convert V210 to I420.
+// V210 is a 10-bit version of UYVY; 16 bytes store 6 pixels.
+// Width is a multiple of 48.
+LIBYUV_API
+int V210ToI420(const uint8* src_v210, int src_stride_v210,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 2 * 2 > kMaxStride) { // 2 rows of UYVY are required.
+ return -1;
+ } else if (!src_v210 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_v210 = src_v210 + (height - 1) * src_stride_v210;
+ src_stride_v210 = -src_stride_v210;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*V210ToUYVYRow)(const uint8* src_v210, uint8* dst_uyvy, int pix);
+ V210ToUYVYRow = V210ToUYVYRow_C;
+
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUVRow = UYVYToUVRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ V210ToUYVYRow(src_v210, row, width);
+ V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
+ UYVYToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ UYVYToYRow(row, dst_y, width);
+ UYVYToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_v210 += src_stride_v210 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ V210ToUYVYRow(src_v210, row, width);
+ UYVYToUVRow(row, 0, dst_u, dst_v, width);
+ UYVYToYRow(row, dst_y, width);
}
return 0;
}
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ARGBToYRow_C;
- }
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ARGBToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
}
return 0;
}
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_bgra ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix);
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width);
+
+ BGRAToYRow = BGRAToYRow_C;
+ BGRAToUVRow = BGRAToUVRow_C;
#if defined(HAS_BGRATOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = BGRAToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = BGRAToYRow_C;
- }
-#if defined(HAS_BGRATOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = BGRAToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = BGRAToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+ BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
}
return 0;
}
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_abgr ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
}
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix);
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width);
+
+ ABGRToYRow = ABGRToYRow_C;
+ ABGRToUVRow = ABGRToUVRow_C;
#if defined(HAS_ABGRTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ABGRToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ABGRToYRow_C;
- }
-#if defined(HAS_ABGRTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = ABGRToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ABGRToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+ ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
}
return 0;
}
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_rgba ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix);
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ RGBAToYRow = RGBAToYRow_C;
+ RGBAToUVRow = RGBAToUVRow_C;
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
+ RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_rgb24 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
}
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RGB24ToARGBRow = RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+#endif
+
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-#if defined(HAS_RGB24TOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = RGB24ToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = RGB24ToYRow_C;
- }
-#if defined(HAS_RGB24TOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = RGB24ToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = RGB24ToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_rgb24 += src_stride_rgb24 * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ RGB24ToARGBRow_C(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
}
return 0;
}
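RGB24ToI420 stages every pair of source rows through the on-stack ARGB buffer row[kMaxStride * 2], which is why frames wider than kMaxStride / 4 pixels are rejected up front. One possible workaround for over-wide frames, sketched here and not part of the library, is to convert in vertical bands at an even x offset while reusing the same strides:

// Illustrative only: convert an over-wide RGB24 frame in vertical bands.
// band_width must be even and small enough that band_width * 4 <= kMaxStride.
int RGB24ToI420Banded(const uint8* src_rgb24, int src_stride_rgb24,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height, int band_width) {
  for (int x = 0; x < width; x += band_width) {
    const int w = (width - x < band_width) ? (width - x) : band_width;
    const int r = RGB24ToI420(src_rgb24 + x * 3, src_stride_rgb24,  // 3 bytes/pixel.
                              dst_y + x, dst_stride_y,
                              dst_u + x / 2, dst_stride_u,  // Chroma is half width.
                              dst_v + x / 2, dst_stride_v,
                              w, height);
    if (r) {
      return r;
    }
  }
  return 0;
}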
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_raw ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_frame = src_frame + (height - 1) * src_stride_frame;
- src_stride_frame = -src_stride_frame;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
}
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RAWToARGBRow = RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+#endif
+
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-#if defined(HAS_RAWTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = RAWToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = RAWToYRow_C;
- }
-#if defined(HAS_RAWTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- ARGBToUVRow = RAWToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = RAWToUVRow_C;
- }
-
- for (int y = 0; y < (height - 1); y += 2) {
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
- ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- src_frame += src_stride_frame * 2;
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
- ARGBToYRow(src_frame, dst_y, width);
+ RAWToARGBRow_C(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
}
return 0;
}
-} // namespace libyuv
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_rgb565 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ RGB565ToARGBRow = RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGB565ToARGBRow_C(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_argb1555 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_argb1555, src_stride_argb1555, width, height, 2, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555,
+ row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGB1555ToARGBRow_C(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (width * 4 > kMaxStride) { // Row buffer is required.
+ return -1;
+ } else if (!src_argb4444 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
+
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ TestReadSafe(src_argb4444, src_stride_argb4444, width, height, 2, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+ ARGBToYRow = ARGBToYRow_C;
+ ARGBToUVRow = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ }
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444,
+ row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGB4444ToARGBRow_C(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
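MJPGToI420 below hands one of these helpers to MJpegDecoder::DecodeToCallback, which invokes the callback once per band of decoded rows with per-plane pointers and strides; the helper converts the band into the caller's planes and advances the pointers kept in I420Buffers. A sketch of a custom callback with the same shape (illustrative; only the callback signature and the DecodeToCallback call are taken from this file):

// Illustrative only: a callback that just tallies how many rows were decoded.
struct RowCounter {
  int rows_seen;
};

static void CountRows(void* opaque,
                      const uint8* const* /* data */,
                      const int* /* strides */,
                      int rows) {
  static_cast<RowCounter*>(opaque)->rows_seen += rows;
}

// Usage, mirroring MJPGToI420 below:
//   RowCounter counter = { 0 };
//   mjpeg_decoder.DecodeToCallback(&CountRows, &counter, dw, dh);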
+
+// MJPG (Motion JPEG) to I420.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port to C
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute the location of the planes and to indicate
+// inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+#ifdef HAVE_JPEG
+ size_t sample_size,
+#else
+ size_t /* sample_size */,
+#endif
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format) {
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // The temporary buffer is also used for in-place conversion, i.e. when the
+  // destination y plane is the same as the source sample.
+ bool need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int y_size = dst_width * abs_dst_height;
+ int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
+ buf = new uint8[y_size + uv_size * 2];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ y = buf;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = dst_width;
+ u_stride = v_stride = ((dst_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_V210:
+      // Stride is a multiple of 48 pixels (128 bytes).
+      // Pixels come in groups of 6 (16 bytes).
+ src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
+ crop_x / 6 * 16;
+ r = V210ToI420(src, (aligned_src_width + 47) / 48 * 128,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_Q420:
+ src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+ src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+ src_width + crop_x * 2;
+ r = Q420ToI420(src, src_width * 3,
+ src_uv, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+    delete [] buf;
+ }
+
+ return r;
+}
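A minimal usage sketch for ConvertToI420, assuming a tightly packed full-frame YUY2 sample, no cropping, and the kRotate0 enumerator from libyuv/rotate.h; the wrapper name and plane layout are illustrative, not part of the library:

// Illustrative caller: whole YUY2 camera frame to I420, no crop, no rotation.
int ConvertCameraFrame(const uint8* sample, size_t sample_size,
                       int width, int height,
                       uint8* y, uint8* u, uint8* v) {
  const int half_width = (width + 1) / 2;
  return ConvertToI420(sample, sample_size,
                       y, width,            // Y plane and stride.
                       u, half_width,       // U plane and stride.
                       v, half_width,       // V plane and stride.
                       0, 0,                // crop_x, crop_y.
                       width, height,       // src_width, src_height.
                       width, height,       // dst_width, dst_height.
                       kRotate0,
                       FOURCC_YUY2);
}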
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
new file mode 100644
index 00000000..1c5aa9d9
--- /dev/null
+++ b/files/source/convert_argb.cc
@@ -0,0 +1,1300 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include <string.h> // for memset()
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional vertical flipping.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width * 4, height);
+ return 0;
+}
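As elsewhere in these files, a negative height makes ARGBCopy walk the source bottom-up, so the copy comes out vertically flipped. Illustrative call, with placeholder buffer names and tightly packed rows assumed:

// Illustrative only: produce a vertically flipped copy by negating height.
int CopyArgbFlipped(const uint8* src, uint8* dst, int width, int height) {
  return ARGBCopy(src, width * 4,     // ARGB is 4 bytes per pixel.
                  dst, width * 4,
                  width, -height);    // Negative height flips the image.
}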
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I411ToARGBRow_C;
+#if defined(HAS_I411TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB (reference version).
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*YToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = YToARGBRow_C;
+#if defined(HAS_YTOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YToARGBRow = YToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ YToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+ I400ToARGBRow_C;
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_bgra || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) =
+ BGRAToARGBRow_C;
+#if defined(HAS_BGRATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ BGRAToARGBRow = BGRAToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ BGRAToARGBRow(src_bgra, dst_argb, width);
+ src_bgra += src_stride_bgra;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_abgr || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) =
+ ABGRToARGBRow_C;
+#if defined(HAS_ABGRTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ABGRToARGBRow = ABGRToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ABGRToARGBRow(src_abgr, dst_argb, width);
+ src_abgr += src_stride_abgr;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgba || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) =
+ RGBAToARGBRow_C;
+#if defined(HAS_RGBATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGBAToARGBRow = RGBAToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGBAToARGBRow(src_rgba, dst_argb, width);
+ src_rgba += src_stride_rgba;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb24 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb565 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb1555 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb4444 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
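+
+// Illustrative usage sketch (not part of the library): convert a
+// hypothetical 640x480 NV12 frame stored in one tightly packed buffer
+// (full-resolution Y plane followed by the interleaved half-resolution
+// UV plane). Buffer names and layout are assumptions, not library code.
+//
+//   const int w = 640, h = 480;
+//   std::vector<uint8> nv12(w * h * 3 / 2);  // Y plane then UV plane.
+//   std::vector<uint8> argb(w * h * 4);
+//   int ret = NV12ToARGB(nv12.data(), w,          // Y plane, stride w.
+//                        nv12.data() + w * h, w,  // UV plane, stride w.
+//                        argb.data(), w * 4,      // ARGB, 4 bytes/pixel.
+//                        w, h);                   // ret is 0 on success.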
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_m420 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ }
+ return 0;
+}
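+
+// Note on the M420 loop above: rows come in groups of two full-width Y rows
+// followed by one interleaved UV row, so both output rows of a pair share
+// the UV row at src_m420 + src_stride_m420 * 2, and the source advances by
+// 3 * src_stride_m420 per pair of output rows.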
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_yuy2 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix) = YUY2ToYRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 rowy[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowu[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowv[kMaxStride]);
+
+ for (int y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
+ YUY2ToYRow(src_yuy2, rowy, width);
+ I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
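+
+// Note on the YUY2 path above: each packed YUY2 row is first split into
+// temporary Y and U/V rows (YUY2ToYRow / YUY2ToUV422Row) and then fed to
+// I422ToARGBRow, so the conversion reuses the I422 row kernels at the cost
+// of three kMaxStride scratch rows per call.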
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_uyvy || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) = UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+#endif
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 rowy[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowu[kMaxStride]);
+ SIMD_ALIGNED(uint8 rowv[kMaxStride]);
+
+ for (int y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, rowu, rowv, width);
+ UYVYToYRow(src_uyvy, rowy, width);
+ I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// Convert MJPG (Motion JPEG) to ARGB.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port to C
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for the source stride computation.
+// src_height is used to compute plane locations, and indicates inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 format) {
+ if (dst_argb == NULL || sample == NULL ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // A temporary buffer is also used when the destination dst_argb is the
+  // same as the source sample (in-place conversion).
+ bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+ uint8* tmp_argb = dst_argb;
+ int tmp_argb_stride = argb_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int argb_size = dst_width * abs_dst_height * 4;
+ buf = new uint8[argb_size];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ dst_argb = buf;
+ argb_stride = dst_width;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+// case FOURCC_V210:
+ // stride is multiple of 48 pixels (128 bytes).
+ // pixels come in groups of 6 = 16 bytes
+// src = sample + (aligned_src_width + 47) / 48 * 128 * crop_y +
+// crop_x / 6 * 16;
+// r = V210ToARGB(src, (aligned_src_width + 47) / 48 * 128,
+// dst_argb, argb_stride,
+// dst_width, inv_dst_height);
+// break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // NV21 is NV12 with U and V swapped; NV21ToARGB handles the VU order.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+// case FOURCC_Q420:
+// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+// src_width + crop_x * 2;
+// r = Q420ToARGB(src, src_width * 3,
+// src_uv, src_width * 3,
+// dst_argb, argb_stride,
+// dst_width, inv_dst_height);
+// break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ dst_argb, argb_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(dst_argb, argb_stride,
+ tmp_argb, tmp_argb_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+    delete [] buf;
+ }
+
+ return r;
+}
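+
+// Illustrative usage sketch (not part of the library): decode a hypothetical
+// 1280x720 YUY2 capture buffer to ARGB with no cropping and no rotation.
+// Buffer names and sizes are assumptions.
+//
+//   const int w = 1280, h = 720;
+//   std::vector<uint8> yuy2(w * h * 2);  // YUY2 is 2 bytes per pixel.
+//   std::vector<uint8> argb(w * h * 4);
+//   int ret = ConvertToARGB(yuy2.data(), yuy2.size(),
+//                           argb.data(), w * 4,
+//                           0, 0,        // crop_x, crop_y
+//                           w, h,        // src_width, src_height
+//                           w, h,        // dst_width, dst_height
+//                           kRotate0, FOURCC_YUY2);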
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
new file mode 100644
index 00000000..4ea974ac
--- /dev/null
+++ b/files/source/convert_from.cc
@@ -0,0 +1,1425 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#elif defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(halfwidth, 4)) {
+ CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+  // Upsample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_u, dst_u, halfwidth);
+ CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
+ src_u += src_stride_u;
+ dst_u += dst_stride_u * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_u, dst_u, halfwidth);
+ }
+
+  // Upsample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_v, dst_v, halfwidth);
+ CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
+ src_v += src_stride_v;
+ dst_v += dst_stride_v * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_v, dst_v, halfwidth);
+ }
+ return 0;
+}
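+
+// Note on the chroma upsample above: 4:2:0 chroma has half as many rows as
+// 4:2:2, so each source U/V row is copied to two destination rows (source
+// row 0 -> destination rows 0 and 1, source row 1 -> rows 2 and 3, ...);
+// an odd trailing destination row reuses the last source row.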
+
+// Use bilinear scaling to upsample the chroma planes.
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ // Upsample U plane.
+ ScalePlaneBilinear(halfwidth, halfheight,
+ width, height,
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Upsample V plane.
+ ScalePlaneBilinear(halfwidth, halfheight,
+ width, height,
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane.
+ ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
+ quarterwidth, height, // to 1/4 width, 1x height
+ src_stride_u,
+ dst_stride_u,
+ src_u, dst_u);
+
+ // Resample V plane.
+ ScalePlaneBilinear(halfwidth, halfheight, // from 1/2 width, 1/2 height
+ quarterwidth, height, // to 1/4 width, 1x height
+ src_stride_v,
+ dst_stride_v,
+ src_v, dst_v);
+ return 0;
+}
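+
+// Worked example for the 411 resample above: for a 64x48 I420 image the
+// source chroma planes are 32x24 (half width, half height) and the I411
+// chroma planes are 16x48 (quarter width, full height), so each chroma
+// plane is rescaled from 32x24 to 16x48 with the bilinear scaler.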
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// YUY2 - Macro-pixel = 2 image pixels.
+// Y0U0Y1V0....Y2U2Y3V2....Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels.
+// U0Y0V0Y1....U2Y2V2Y3....U4Y4V4Y5....
+
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_I42XTOYUY2ROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void I42xToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqa xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#define HAS_I42XTOUYVYROW_SSE2
+__declspec(naked) __declspec(align(16))
+static void I42xToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqa xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqa [edi], xmm1
+ movdqa [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_I42XTOYUY2ROW_SSE2
+static void I42xToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%3) \n"
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_I42XTOUYVYROW_SSE2
+static void I42xToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%3) \n"
+ "movdqa %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+#endif
+
+static void I42xToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[0]; // duplicate last y
+ dst_frame[3] = src_v[0];
+ }
+}
+
+static void I42xToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[0]; // duplicate last y
+ }
+}
+
+// Little-endian targets: x86, x64, ARM, or __ORDER_LITTLE_ENDIAN__ compilers.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+#define EIGHTTOTEN(x) ((x) << 2 | (x) >> 6)
+static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) {
+ for (int x = 0; x < width; x += 6) {
+ WRITEWORD(dst_v210 + 0, (EIGHTTOTEN(src_uyvy[0])) |
+ (EIGHTTOTEN(src_uyvy[1]) << 10) |
+ (EIGHTTOTEN(src_uyvy[2]) << 20));
+ WRITEWORD(dst_v210 + 4, (EIGHTTOTEN(src_uyvy[3])) |
+ (EIGHTTOTEN(src_uyvy[4]) << 10) |
+ (EIGHTTOTEN(src_uyvy[5]) << 20));
+ WRITEWORD(dst_v210 + 8, (EIGHTTOTEN(src_uyvy[6])) |
+ (EIGHTTOTEN(src_uyvy[7]) << 10) |
+ (EIGHTTOTEN(src_uyvy[8]) << 20));
+ WRITEWORD(dst_v210 + 12, (EIGHTTOTEN(src_uyvy[9])) |
+ (EIGHTTOTEN(src_uyvy[10]) << 10) |
+ (EIGHTTOTEN(src_uyvy[11]) << 20));
+ src_uyvy += 12;
+ dst_v210 += 16;
+ }
+}
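+
+// Worked example for EIGHTTOTEN above: an 8-bit sample is widened to 10 bits
+// by replicating its two high bits into the low bits, e.g. 0x00 -> 0x000,
+// 0x80 -> 0x202 and 0xFF -> 0x3FF, preserving full range without a multiply.
+// Three 10-bit samples are then packed per 32-bit word, giving the V210
+// layout of 16 bytes per 6 pixels that I420ToV210 below checks against
+// kMaxStride.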
+
+// TODO(fbarchard): Deprecate, move or expand 422 support?
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToYUY2Row_C;
+#if defined(HAS_I42XTOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToYUY2Row = I42xToYUY2Row_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+    I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToYUY2Row_C;
+#if defined(HAS_I42XTOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToYUY2Row = I42xToYUY2Row_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ I42xToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_frame + dst_stride_frame, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+ }
+ return 0;
+}
+
+// TODO(fbarchard): Deprecate, move or expand 422 support?
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+    I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ I42xToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_frame + dst_stride_frame, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToV210(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height) {
+  if (width * 16 / 6 > kMaxStride) {  // V210 row buffer would not fit.
+ return -1;
+ } else if (!src_y || !src_u || !src_v || !dst_frame ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_frame = dst_frame + (height - 1) * dst_stride_frame;
+ dst_stride_frame = -dst_stride_frame;
+ }
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*UYVYToV210Row)(const uint8* src_uyvy, uint8* dst_v210, int pix);
+ UYVYToV210Row = UYVYToV210Row_C;
+
+ void (*I42xToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_frame, int width) =
+ I42xToUYVYRow_C;
+#if defined(HAS_I42XTOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+ I42xToUYVYRow = I42xToUYVYRow_SSE2;
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I42xToUYVYRow(src_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame, width);
+ I42xToUYVYRow(src_y + src_stride_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame + dst_stride_frame, width);
+
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_frame += dst_stride_frame * 2;
+ }
+ if (height & 1) {
+ I42xToUYVYRow(src_y, src_u, src_v, row, width);
+ UYVYToV210Row(row, dst_frame, width);
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
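+
+// Illustrative usage sketch (not part of the library): convert a
+// hypothetical tightly packed 320x240 I420 frame to ARGB. Buffer names and
+// the tight-stride plane layout are assumptions.
+//
+//   const int w = 320, h = 240;
+//   std::vector<uint8> i420(w * h * 3 / 2);
+//   std::vector<uint8> argb(w * h * 4);
+//   const uint8* y_plane = i420.data();
+//   const uint8* u_plane = y_plane + w * h;
+//   const uint8* v_plane = u_plane + (w / 2) * (h / 2);
+//   int ret = I420ToARGB(y_plane, w,
+//                        u_plane, w / 2,
+//                        v_plane, w / 2,
+//                        argb.data(), w * 4,
+//                        w, h);  // ret is 0 on success, -1 on bad arguments.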
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgb24 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB24Row_C;
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#elif defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+ dst_stride_rgb = -dst_stride_rgb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToRGB565Row(row, dst_rgb, width);
+ dst_rgb += dst_stride_rgb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
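+
+// Note on the RGB565 path above (and the ARGB1555/ARGB4444 paths below):
+// rather than using a dedicated I420-to-16-bit row kernel, each row is
+// converted to a temporary ARGB row in the kMaxStride scratch buffer and
+// then packed down to 16 bits per pixel by the ARGBTo*Row function.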
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToARGB1555Row(row, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToARGB4444Row(row, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format) {
+  if (!y || !u || !v || !dst_sample ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ int r = 0;
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_V210:
+ r = I420ToV210(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride :
+ (width + 47) / 48 * 128,
+ width, height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGGR:
+ r = I420ToBayerBGGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GBRG:
+ r = I420ToBayerGBRG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GRBG:
+ r = I420ToBayerGRBG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_RGGB:
+ r = I420ToBayerRGGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ // Triplanar formats
+ // TODO(fbarchard): halfstride instead of halfwidth
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * halfheight;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * halfheight;
+ }
+ r = I420Copy(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ int halfwidth = (width + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * height;
+ }
+ r = I420ToI422(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + width * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + width * height;
+ }
+ r = I420ToI444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, width,
+ dst_v, width,
+ width, height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (width + 3) / 4;
+ uint8* dst_u = dst_sample + width * height;
+ uint8* dst_v = dst_u + quarterwidth * height;
+ r = I420ToI411(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, quarterwidth,
+ dst_v, quarterwidth,
+ width, height);
+ break;
+ }
+
+    // Formats not supported: MJPG, biplanar, and some RGB formats.
+    default:
+      return -1;  // Unknown FourCC: return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
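A brief usage sketch for the dispatcher above (illustrative, not part of the patch); the frame dimensions and buffers are assumptions, and passing 0 for dst_sample_stride lets the switch above pick the per-format default (width * 4 for ARGB):

    #include "libyuv/convert_from.h"
    #include "libyuv/video_common.h"

    // Hypothetical caller: convert a 640x480 I420 frame to ARGB.
    int ExampleI420ToARGB(const uint8* y, const uint8* u, const uint8* v,
                          uint8* argb) {
      const int width = 640;
      const int height = 480;
      return libyuv::ConvertFromI420(y, width,
                                     u, (width + 1) / 2,
                                     v, (width + 1) / 2,
                                     argb, 0,  // 0 stride defaults to width * 4
                                     width, height,
                                     libyuv::FOURCC_ARGB);
    }
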
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index cc44e215..2e96d9b9 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -9,66 +9,206 @@
*/
#include "libyuv/cpu_id.h"
-#include "libyuv/basic_types.h" // for CPU_X86
#ifdef _MSC_VER
-#include <intrin.h>
+#include <intrin.h> // For __cpuid()
#endif
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#include <stdlib.h> // For getenv()
+
+// For ArmCpuCaps(), but unit tested on all platforms.
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h" // For CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- __asm__ volatile (
- "mov %%ebx, %%edi\n"
- "cpuid\n"
- "xchg %%edi, %%ebx\n"
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
+ : "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- __asm__ volatile (
- "cpuid\n"
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
+ : "a"(info_type));
}
#endif
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__))
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type) {
+ __cpuid(cpu_info, info_type);
+}
+#else
+LIBYUV_API
+void CpuId(int cpu_info[4], int) {
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ return static_cast<uint32>(_xgetbv(xcr));
+}
+#elif !defined(__CLR_VER) && defined(_M_IX86)
+#define HAS_XGETBV
+__declspec(naked) __declspec(align(16))
+static uint32 XGetBV(unsigned int xcr) {
+ __asm {
+ mov ecx, [esp + 4] // xcr
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005.
+ ret
+ }
+}
+#elif defined(__i386__) || defined(__x86_64__)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ uint32 xcr_feature_mask;
+ asm volatile ( // NOLINT
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(xcr_feature_mask)
+ : "c"(xcr)
+ : "memory", "cc", "edx"); // edx unused.
+ return xcr_feature_mask;
+}
+#endif
+#ifdef HAS_XGETBV
+static const int kXCR_XFEATURE_ENABLED_MASK = 0;
+#endif
+
+// Based on libvpx arm_cpudetect.c.
+// For ARM, but public to allow testing on any CPU.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name) {
+ int flags = 0;
+ FILE* fin = fopen(cpuinfo_name, "r");
+ if (fin) {
+ char buf[512];
+ while (fgets(buf, 511, fin)) {
+ if (memcmp(buf, "Features", 8) == 0) {
+ flags |= kCpuInitialized;
+ char* p = strstr(buf, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ flags |= kCpuHasNEON;
+ break;
+ }
+ }
+ }
+ fclose(fin);
+ }
+ return flags;
+}
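For context, ArmCpuCaps() keys off the "Features" line of /proc/cpuinfo and looks for a standalone " neon" token; on a NEON-capable SoC a representative line might look roughly like the following (illustrative, not taken from a specific device):

    Features  : swp half thumb fastmult vfp edsp thumbee neon vfpv3
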
// CPU detect function for SIMD instruction sets.
-static int cpu_info_ = 0;
+LIBYUV_API
+int cpu_info_ = 0;
-// TODO(fbarchard): (cpu_info[2] & 0x10000000 ? kCpuHasAVX : 0)
-static void InitCpuFlags() {
-#ifdef CPU_X86
+// Test environment variable for disabling CPU features. Any non-zero value
+// disables the feature. Zero is ignored so the variable can be toggled on/off.
+static bool TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return true;
+ }
+ }
+ return false;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+#if !defined(__CLR_VER) && defined(CPU_X86)
int cpu_info[4];
__cpuid(cpu_info, 1);
- cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
- (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
- kCpuInitialized;
+ cpu_info_ = ((cpu_info[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ (((cpu_info[2] & 0x18000000) == 0x18000000) ? kCpuHasAVX : 0) |
+ kCpuInitialized | kCpuHasX86;
+#ifdef HAS_XGETBV
+ if (cpu_info_ & kCpuHasAVX) {
+ __cpuid(cpu_info, 7);
+ if ((cpu_info[1] & 0x00000020) &&
+ ((XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06)) {
+ cpu_info_ |= kCpuHasAVX2;
+ }
+ }
+#endif
+ // environment variable overrides for testing.
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info_ &= ~kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info_ &= ~kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info_ &= ~kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info_ &= ~kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info_ &= ~kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info_ &= ~kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info_ &= ~kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = kCpuInitialized;
+ }
+#elif defined(__arm__)
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+  // Linux ARM: parse the /proc/cpuinfo text file for NEON detection.
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
- cpu_info_ = kCpuHasNEON | kCpuInitialized;
-#else
- cpu_info_ = kCpuInitialized;
+ cpu_info_ = kCpuHasNEON;
#endif
+ cpu_info_ |= kCpuInitialized | kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info_ &= ~kCpuHasNEON;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = kCpuInitialized;
+ }
+#endif // __arm__
+ return cpu_info_;
}
+LIBYUV_API
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
- cpu_info_ &= enable_flags;
-}
-
-bool TestCpuFlag(int flag) {
- if (0 == cpu_info_) {
- InitCpuFlags();
- }
- return cpu_info_ & flag ? true : false;
+ cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
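A short usage sketch for the detection path above (illustrative, not part of the patch). TestCpuFlag() initializes the flags lazily, and MaskCpuFlags(0) leaves only kCpuInitialized set, which matches the effect of exporting LIBYUV_DISABLE_ASM=1 before startup:

    #include "libyuv/cpu_id.h"

    // Hypothetical helpers around the public cpu_id API shown above.
    bool HasSSSE3() {
      return libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
    }

    void ForcePortableCode() {
      libyuv::MaskCpuFlags(0);  // keep only kCpuInitialized; C paths everywhere
    }
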
diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc
index 958f44c4..ed12de88 100644
--- a/files/source/format_conversion.cc
+++ b/files/source/format_conversion.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,66 +8,73 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
+#include "libyuv/format_conversion.h"
+#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "video_common.h"
-#include "row.h"
-
-#define kMaxStride (2048 * 4)
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
-// and vst would select which 2 components to write. The low level would need
+// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_bayer
- movd xmm7, [esp + 12] // selector
+ movd xmm5, [esp + 12] // selector
mov ecx, [esp + 16] // pix
- pshufd xmm7, xmm7, 0
+ pshufd xmm5, xmm5, 0
+ align 16
wloop:
movdqa xmm0, [eax]
lea eax, [eax + 16]
- pshufb xmm0, xmm7
+ pshufb xmm0, xmm5
+ sub ecx, 4
movd [edx], xmm0
lea edx, [edx + 4]
- sub ecx, 4
- ja wloop
+ jg wloop
ret
}
}
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBTOBAYERROW_SSSE3
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
- asm volatile(
- "movd %3,%%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "movd %3,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
- : "r"(selector) // %3
- : "memory"
+ : "g"(selector) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+
);
}
#endif
@@ -77,7 +84,7 @@ static void ARGBToBayerRow_C(const uint8* src_argb,
int index0 = selector & 0xff;
int index1 = (selector >> 8) & 0xff;
// Copy a row of Bayer.
- for (int x = 0; x < (pix - 1); x += 2) {
+ for (int x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[index0];
dst_bayer[1] = src_argb[index1];
src_argb += 8;
@@ -96,243 +103,258 @@ static uint32 GenerateSelector(int select0, int select1) {
static_cast<uint32>((select1 + 12) << 24);
}
-// Converts 32 bit ARGB to any Bayer RGB format.
-int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
- uint8* dst_bayer, int dst_stride_bayer,
- uint32 dst_fourcc_bayer,
- int width, int height) {
- if (height < 0) {
- height = -height;
- src_rgb = src_rgb + (height - 1) * src_stride_rgb;
- src_stride_rgb = -src_stride_rgb;
- }
- void (*ARGBToBayerRow)(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix);
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) &&
- IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- } else
-#endif
- {
- ARGBToBayerRow = ARGBToBayerRow_C;
- }
-
- int blue_index = 0;
- int green_index = 1;
- int red_index = 2;
-
+static int MakeSelectors(const int blue_index,
+ const int green_index,
+ const int red_index,
+ uint32 dst_fourcc_bayer,
+ uint32 *index_map) {
// Now build a lookup table containing the indices for the four pixels in each
// 2x2 Bayer grid.
- uint32 index_map[2];
switch (dst_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
case FOURCC_BGGR:
index_map[0] = GenerateSelector(blue_index, green_index);
index_map[1] = GenerateSelector(green_index, red_index);
break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
case FOURCC_GBRG:
index_map[0] = GenerateSelector(green_index, blue_index);
index_map[1] = GenerateSelector(red_index, green_index);
break;
+ case FOURCC_RGGB:
+ index_map[0] = GenerateSelector(red_index, green_index);
+ index_map[1] = GenerateSelector(green_index, blue_index);
+ break;
+ case FOURCC_GRBG:
+ index_map[0] = GenerateSelector(green_index, red_index);
+ index_map[1] = GenerateSelector(blue_index, green_index);
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+ return 0;
+}
+
+// Converts 32 bit ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+#endif
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
}
- // Now convert.
for (int y = 0; y < height; ++y) {
- ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width);
- src_rgb += src_stride_rgb;
+ ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
+ src_argb += src_stride_argb;
dst_bayer += dst_stride_bayer;
}
return 0;
}
-#define AVG(a,b) (((a) + (b)) >> 1)
+#define AVG(a, b) (((a) + (b)) >> 1)
static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = src_bayer0[0];
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = AVG(r, src_bayer1[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer1[1];
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
g = src_bayer0[1];
r = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = src_bayer0[0];
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = AVG(r, src_bayer1[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[0];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer1[1];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[0];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = AVG(b, src_bayer1[1]);
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = src_bayer0[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[1];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[7] = 255U;
g = src_bayer0[1];
b = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = AVG(b, src_bayer1[1]);
- dst_rgb[1] = AVG(g, src_bayer0[1]);
- dst_rgb[2] = src_bayer0[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[1];
- dst_rgb[5] = src_bayer0[1];
- dst_rgb[6] = src_bayer0[0];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer0[0];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = AVG(b, src_bayer0[1]);
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = src_bayer1[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[1];
- dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[7] = 255U;
b = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = AVG(b, src_bayer0[1]);
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = src_bayer1[0];
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer0[1];
- dst_rgb[5] = src_bayer0[0];
- dst_rgb[6] = src_bayer1[0];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer1[0];
+ dst_argb[7] = 255U;
+ }
}
static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_rgb, int pix) {
+ uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1];
- for (int x = 0; x < (pix - 2); x += 2) {
- dst_rgb[0] = src_bayer1[0];
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = AVG(r, src_bayer0[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_rgb[6] = src_bayer0[1];
- dst_rgb[7] = 255U;
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
r = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
- dst_rgb += 8;
- }
- dst_rgb[0] = src_bayer1[0];
- dst_rgb[1] = src_bayer0[0];
- dst_rgb[2] = AVG(r, src_bayer0[1]);
- dst_rgb[3] = 255U;
- dst_rgb[4] = src_bayer1[0];
- dst_rgb[5] = src_bayer0[0];
- dst_rgb[6] = src_bayer0[1];
- dst_rgb[7] = 255U;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[0];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ }
}
// Converts any Bayer RGB format to ARGB.
-int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height) {
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
if (height < 0) {
height = -height;
- dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
- dst_stride_rgb = -dst_stride_rgb;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
+ uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
-
+ uint8* dst_argb, int pix);
switch (src_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
break;
+ default:
+ return -1; // Bad FourCC
}
- for (int y = 0; y < (height - 1); y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
+ for (int y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_rgb + dst_stride_rgb, width);
+ dst_argb + dst_stride_argb, width);
src_bayer += src_stride_bayer * 2;
- dst_rgb += dst_stride_rgb * 2;
+ dst_argb += dst_stride_argb * 2;
}
if (height & 1) {
- BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width);
+ BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width);
}
return 0;
}
// Converts any Bayer RGB format to ARGB.
-int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint32 src_fourcc_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
if (width * 4 > kMaxStride) {
- return -1;
+ return -1; // Size too large for row buffer
}
// Negative height means invert the image.
if (height < 0) {
@@ -346,60 +368,50 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
dst_stride_v = -dst_stride_v;
}
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
+ uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_rgb, int pix);
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ uint8* dst_argb, int pix);
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
- } else
-#endif
- {
- ARGBToYRow = ARGBToYRow_C;
}
+#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
- } else
-#endif
- {
- ARGBToUVRow = ARGBToUVRow_C;
}
+#endif
switch (src_fourcc_bayer) {
- default:
- assert(false);
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
break;
+ default:
+ return -1; // Bad FourCC
}
- for (int y = 0; y < (height - 1); y += 2) {
+ for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width);
@@ -411,7 +423,6 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
- // TODO(fbarchard): Make sure this filters properly
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
@@ -420,4 +431,124 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
return 0;
}
+// Convert I420 to Bayer.
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+#elif defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+#endif
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+#endif
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+ dst_bayer += dst_stride_bayer;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_y, int dst_stride_y, \
+ uint8* dst_u, int dst_stride_u, \
+ uint8* dst_v, int dst_stride_v, \
+ int width, int height) { \
+ return BayerToI420(src_bayer, src_stride_bayer, \
+ dst_y, dst_stride_y, \
+ dst_u, dst_stride_u, \
+ dst_v, dst_stride_v, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+ const uint8* src_u, int src_stride_u, \
+ const uint8* src_v, int src_stride_v, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return I420ToBayer(src_y, src_stride_y, \
+ src_u, src_stride_u, \
+ src_v, src_stride_v, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return ARGBToBayer(src_argb, src_stride_argb, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_argb, int dst_stride_argb, \
+ int width, int height) { \
+ return BayerToARGB(src_bayer, src_stride_bayer, \
+ dst_argb, dst_stride_argb, \
+ width, height, \
+ FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
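The MAKEBAYERFOURCC expansion above generates the per-pattern wrappers (Bayer*ToI420, I420ToBayer*, ARGBToBayer*, Bayer*ToARGB). A minimal caller sketch for one of them; the stride choices are illustrative assumptions (Bayer data is 1 byte per pixel, ARGB is 4):

    #include "libyuv/format_conversion.h"

    // Hypothetical demosaic call using the generated BGGR wrapper.
    int DemosaicBGGR(const uint8* bayer, uint8* argb, int width, int height) {
      return libyuv::BayerBGGRToARGB(bayer, width,
                                     argb, width * 4,
                                     width, height);
    }
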
diff --git a/files/source/general.cc b/files/source/general.cc
deleted file mode 100644
index 9d39f9bf..00000000
--- a/files/source/general.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/general.h"
-
-#include <string.h> // memcpy(), memset()
-
-#include "libyuv/planar_functions.h"
-
-namespace libyuv {
-
-int
-I420Mirror(const uint8* src_yplane, int src_ystride,
- const uint8* src_uplane, int src_ustride,
- const uint8* src_vplane, int src_vstride,
- uint8* dst_yplane, int dst_ystride,
- uint8* dst_uplane, int dst_ustride,
- uint8* dst_vplane, int dst_vstride,
- int width, int height) {
- if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
- dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) {
- return -1;
- }
-
- int indO = 0;
- int indS = 0;
- int wind, hind;
- uint8 tmpVal, tmpValU, tmpValV;
- // Will swap two values per iteration
- const int halfWidth = (width + 1) >> 1;
-
- // Y
- for (wind = 0; wind < halfWidth; wind++) {
- for (hind = 0; hind < height; hind++) {
- indO = hind * src_ystride + wind;
- indS = hind * dst_ystride + (width - wind - 1);
- tmpVal = src_yplane[indO];
- dst_yplane[indO] = src_yplane[indS];
- dst_yplane[indS] = tmpVal;
- }
- }
-
- const int halfHeight = (height + 1) >> 1;
- const int halfSrcuvStride = (height + 1) >> 1;
- const int halfuvWidth = (width + 1) >> 2;
-
- for (wind = 0; wind < halfuvWidth; wind++) {
- for (hind = 0; hind < halfHeight; hind++) {
- indO = hind * halfSrcuvStride + wind;
- indS = hind * halfSrcuvStride + (halfuvWidth - wind - 1);
- // U
- tmpValU = src_uplane[indO];
- dst_uplane[indO] = src_uplane[indS];
- dst_uplane[indS] = tmpValU;
- // V
- tmpValV = src_vplane[indO];
- dst_vplane[indO] = src_vplane[indS];
- dst_vplane[indS] = tmpValV;
- }
- }
- return 0;
-}
-
-// Make a center cut
-int
-I420Crop(uint8* frame,
- int src_width, int src_height,
- int dst_width, int dst_height)
-{
- if (frame == NULL)
- return -1;
-
- if (src_width == dst_width && src_height == dst_height) {
- // Nothing to do
- return 3 * dst_height * dst_width / 2;
- }
- if (dst_width > src_width || dst_height > src_height) {
- // error
- return -1;
- }
- int i = 0;
- int m = 0;
- int loop = 0;
- int half_dst_width = dst_width / 2;
- int halfdst_height = dst_height / 2;
- int halfsrc_width = src_width / 2;
- int half_dst_height= src_height / 2;
- int crop_height = ( src_height - dst_height ) / 2;
- int crop_width = ( src_width - dst_width ) / 2;
-
- for (i = src_width * crop_height + crop_width; loop < dst_height ;
- loop++, i += src_width) {
- memcpy(&frame[m],&frame[i],dst_width);
- m += dst_width;
- }
- i = src_width * src_height; // ilum
- loop = 0;
- for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
- loop < halfdst_height; loop++,i += halfsrc_width) {
- memcpy(&frame[m],&frame[i],half_dst_width);
- m += half_dst_width;
- }
- loop = 0;
- i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr
- for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
- loop < halfdst_height; loop++, i += halfsrc_width) {
- memcpy(&frame[m],&frame[i],half_dst_width);
- m += half_dst_width;
- }
- return 0;
-}
-
-
-int
-I420CropPad(const uint8* src_frame, int src_width,
- int src_height, uint8* dst_frame,
- int dst_width, int dst_height)
-{
- if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) {
- return -1;
- }
- if (src_width == dst_width && src_height == dst_height) {
- memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1));
- } else {
- if (src_height < dst_height) {
- // pad height
- int pad_height = dst_height - src_height;
- int i = 0;
- int pad_width = 0;
- int crop_width = 0;
- int width = src_width;
- if (src_width < dst_width) {
- // pad width
- pad_width = dst_width - src_width;
- } else {
- // cut width
- crop_width = src_width - dst_width;
- width = dst_width;
- }
- if (pad_height) {
- memset(dst_frame, 0, dst_width * (pad_height >> 1));
- dst_frame += dst_width * (pad_height >> 1);
- }
- for (i = 0; i < src_height;i++) {
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- src_frame += crop_width >> 1; // in case we have a cut
- memcpy(dst_frame,src_frame ,width);
- src_frame += crop_width >> 1;
- dst_frame += width;
- src_frame += width;
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 0, dst_width * (pad_height >> 1));
- dst_frame += dst_width * (pad_height >> 1);
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
- dst_frame += (dst_width >> 2) * (pad_height >> 1);
- }
- for (i = 0; i < (src_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1));
- dst_frame += (dst_width >> 1) * (pad_height >> 1);
- }
- for (i = 0; i < (src_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- if (pad_height) {
- memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
- dst_frame += (dst_width >> 2) * (pad_height >> 1);
- }
- } else {
- // cut height
- int i = 0;
- int pad_width = 0;
- int crop_width = 0;
- int width = src_width;
-
- if (src_width < dst_width) {
- // pad width
- pad_width = dst_width - src_width;
- } else {
- // cut width
- crop_width = src_width - dst_width;
- width = dst_width;
- }
- int diff_height = src_height - dst_height;
- src_frame += src_width * (diff_height >> 1); // skip top I
-
- for (i = 0; i < dst_height; i++) {
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- src_frame += crop_width >> 1; // in case we have a cut
- memcpy(dst_frame,src_frame ,width);
- src_frame += crop_width >> 1;
- dst_frame += width;
- src_frame += width;
- if (pad_width) {
- memset(dst_frame, 0, pad_width / 2);
- dst_frame += pad_width / 2;
- }
- }
- src_frame += src_width * (diff_height >> 1); // skip end I
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr
- for (i = 0; i < (dst_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame,width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr
- src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb
- for (i = 0; i < (dst_height >> 1); i++) {
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- src_frame += crop_width >> 2; // in case we have a cut
- memcpy(dst_frame, src_frame, width >> 1);
- src_frame += crop_width >> 2;
- dst_frame += width >> 1;
- src_frame += width >> 1;
- if (pad_width) {
- memset(dst_frame, 127, pad_width >> 2);
- dst_frame += pad_width >> 2;
- }
- }
- }
- }
- return 0;
-}
-
-} // namespace libyuv
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
new file mode 100644
index 00000000..aa603947
--- /dev/null
+++ b/files/source/mjpeg_decoder.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+// Must be included before jpeglib
+#include <assert.h>
+#ifndef __CLR_VER
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+extern "C" {
+#include <jpeglib.h>
+}
+
+#include <climits>
+#include <cstring>
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(false),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+// Helper function to validate that the jpeg looks ok.
+// TODO(fbarchard): Improve performance. Scan backward for EOI?
+bool ValidateJpeg(const uint8* sample, size_t sample_size) {
+ if (sample_size < 64) {
+ // ERROR: Invalid jpeg size: sample_size
+ return false;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) {
+ // ERROR: Invalid jpeg initial start code
+ return false;
+ }
+ bool soi = true;
+ int total_eoi = 0;
+ for (int i = 2; i < static_cast<int>(sample_size) - 1; ++i) {
+ if (sample[i] == 0xff) {
+ if (sample[i + 1] == 0xd8) { // Start Of Image
+ soi = true;
+ } else if (sample[i + 1] == 0xd9) { // End Of Image
+ if (soi) {
+ ++total_eoi;
+ }
+ soi = false;
+ }
+ }
+ }
+ if (!total_eoi) {
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return false;
+ }
+ return true;
+}
+
+bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return false;
+ }
+
+ buf_.data = src;
+ buf_.len = static_cast<int>(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return false;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+ delete scanlines_[i];
+ }
+ scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCU blocks. For
+ // the preceding scanlines, the padding is not needed/wanted because the
+ // following addresses will already be valid (they are the initial bytes of
+ // the next scanline) and will be overwritten when jpeglib writes out that
+ // next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+ delete databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = true;
+ }
+ }
+ return true;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor /
+ GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor /
+ GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+bool MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+static void CopyRows(uint8* source, int source_stride,
+ uint8* dest, int pixels, int numrows) {
+ for (int i = 0; i < numrows; ++i) {
+ memcpy(dest, source, pixels);
+ dest += pixels;
+ source += source_stride;
+ }
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+bool MJpegDecoder::DecodeToBuffers(
+ uint8** planes, int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // Compute the number of lines to skip to implement vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of the maximum component
+  // subsample, i.e. 2.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip =
+ DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+ rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyRows(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyRows(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyRows(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // TODO(fbarchard): Compute the number of lines to skip for vertical crop.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
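For orientation, a hedged sketch of how the decoder is typically driven (buffer ownership, plane count, and sizes are assumptions; real callers should check GetNumComponents() and the sub-sample factors before sizing the planes):

    #include "libyuv/mjpeg_decoder.h"

    // Hypothetical wrapper: decode one MJPEG frame into caller-owned planes.
    bool DecodeFrame(const uint8* sample, size_t sample_size,
                     uint8* y, uint8* u, uint8* v) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(sample, sample_size)) {
        return false;  // Invalid JPEG or bad header.
      }
      uint8* planes[3] = { y, u, v };
      return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                     decoder.GetHeight());
    }
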
+void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+ // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error. Unfortunately
+  // jpeglib's error handling model is not very flexible, because it expects the
+  // error handler to not return--i.e., it wants the program to terminate. To
+  // recover from errors we use setjmp() as shown in their example: setjmp()
+  // records a point on the call stack that longjmp() can later jump back to,
+  // giving a limited, non-local form of error return instead of termination.
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+ // ERROR: Error in jpeglib: buf
+
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
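The comment above only briefly describes jpeglib's setjmp()-based error recovery. The following standalone sketch shows that pattern, modeled on libjpeg's own example code rather than on MJpegDecoder's internals; ExampleErrorMgr, ExampleErrorExit and DecodeHeaderOrFail are illustrative names, and jpeg_mem_src() assumes libjpeg 8 or libjpeg-turbo.

#include <csetjmp>
#include <cstddef>
#include <cstdint>
#include <jpeglib.h>

// Error manager that extends jpeg_error_mgr with a jump buffer (illustrative).
struct ExampleErrorMgr {
  jpeg_error_mgr pub;      // "public" fields libjpeg expects at offset 0
  jmp_buf setjmp_buffer;   // where longjmp() will return to
};

// Installed as error_exit so libjpeg does not terminate the process.
static void ExampleErrorExit(j_common_ptr cinfo) {
  ExampleErrorMgr* mgr = reinterpret_cast<ExampleErrorMgr*>(cinfo->err);
  longjmp(mgr->setjmp_buffer, 1);  // rewind the stack to the setjmp() below
}

// Returns false instead of aborting when the JPEG header is malformed.
static bool DecodeHeaderOrFail(const uint8_t* buf, size_t len) {
  jpeg_decompress_struct cinfo;
  ExampleErrorMgr jerr;
  cinfo.err = jpeg_std_error(&jerr.pub);
  jerr.pub.error_exit = ExampleErrorExit;
  if (setjmp(jerr.setjmp_buffer)) {
    // A libjpeg call below failed; longjmp() brought control back here.
    jpeg_destroy_decompress(&cinfo);
    return false;
  }
  jpeg_create_decompress(&cinfo);
  jpeg_mem_src(&cinfo, const_cast<uint8_t*>(buf),
               static_cast<unsigned long>(len));  // libjpeg 8 / libjpeg-turbo
  jpeg_read_header(&cinfo, TRUE);
  jpeg_destroy_decompress(&cinfo);
  return true;
}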
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8** [num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8* [num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete [] scanlines_[i];
+ delete [] databuf_[i];
+ }
+ delete [] scanlines_;
+ delete [] databuf_;
+ delete [] scanlines_sizes_;
+ delete [] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling do_block_smoothing improve performance substantially.
+bool MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ decompress_struct_->do_fancy_upsampling = false; // Not applicable to 'raw'
+ decompress_struct_->enable_2pass_quant = false; // Only for buffered mode
+ decompress_struct_->do_block_smoothing = false; // blocky but fast
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+ return false;
+ }
+ return true;
+}
+
+bool MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline bool MJpegDecoder::DecodeImcuRow() {
+ return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_,
+ scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// Helper that maps per-component sub-sampling factors to a JpegSubsamplingType.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 2 &&
+ subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
+
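As a usage illustration for DecodeToCallback() above: the decoder calls the supplied function once per group of decoded rows with per-component pointers, per-component strides, and a scanline count. The sketch below assumes the CallbackFunction typedef and the LoadFrame()/GetWidth()/GetHeight() accessors declared in mjpeg_decoder.h (not shown in this hunk); YPlaneSink and GatherY are illustrative names, and for brevity only the Y plane is gathered. Subsampled chroma planes would additionally need each component's vertical subsample factor to convert the luma scanline count into rows of that component.

#include <string.h>  // memcpy

// uint8 here is libyuv's typedef for unsigned char (libyuv/basic_types.h).
// Caller-owned destination for the luma plane, packed with stride == width.
struct YPlaneSink {
  uint8* dst;
  int width;  // bytes per row
};

// Matches the assumed CallbackFunction signature from mjpeg_decoder.h.
static void GatherY(void* opaque, const uint8* const* data,
                    const int* strides, int rows) {
  YPlaneSink* sink = static_cast<YPlaneSink*>(opaque);
  const uint8* src = data[0];  // component 0 is Y
  for (int r = 0; r < rows; ++r) {
    memcpy(sink->dst, src, sink->width);
    src += strides[0];
    sink->dst += sink->width;
  }
}

// Sketch of driving the decoder (LoadFrame() assumed from the header):
//   libyuv::MJpegDecoder decoder;
//   if (decoder.LoadFrame(jpeg_bytes, jpeg_len)) {
//     YPlaneSink sink = { y_plane, decoder.GetWidth() };
//     decoder.DecodeToCallback(&GatherY, &sink,
//                              decoder.GetWidth(), decoder.GetHeight());
//   }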
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index a7e3e38a..a7f5086a 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,321 +10,104 @@
#include "libyuv/planar_functions.h"
-#include <string.h>
+#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
-#include "row.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
-
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITUV_NEON
-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
-// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-static void SplitUV_NEON(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm__ volatile
- (
- "1:\n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
- "vst1.u8 {q0}, [%1]! \n" // store U
- "vst1.u8 {q1}, [%2]! \n" // Store V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_uv),
- "+r"(dst_u),
- "+r"(dst_v),
- "+r"(pix) // Output registers
- : // Input registers
- : "q0", "q1" // Clobber List
- );
-}
-
-#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+extern "C" {
#endif
-// Shuffle table for converting ABGR to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
- 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
-
-// Shuffle table for converting BGRA to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
- 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITUV_SSE2
-__declspec(naked)
-static void SplitUV_SSE2(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm7 // even bytes
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqa [edi], xmm2
- lea edi, [edi + 16]
- sub ecx, 16
- ja wloop
- pop edi
- ret
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_SPLITUV_SSE2
-static void SplitUV_SSE2(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "psrlw $0x8,%%xmm2\n"
- "psrlw $0x8,%%xmm3\n"
- "packuswb %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,(%2)\n"
- "lea 0x10(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory"
-);
-}
#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
#endif
-
-static void SplitUV_C(const uint8* src_uv,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of UV.
- for (int x = 0; x < pix; ++x) {
- dst_u[0] = src_uv[0];
- dst_v[0] = src_uv[1];
- src_uv += 2;
- dst_u += 1;
- dst_v += 1;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
}
-}
+#endif
-static void I420CopyPlane(const uint8* src_y, int src_stride_y,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
// Copy plane
for (int y = 0; y < height; ++y) {
- memcpy(dst_y, src_y, width);
+ CopyRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
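Since CopyPlane() takes independent source and destination strides, it can also lift a window out of a larger buffer; a minimal sketch follows (CopyYWindow and the buffer names are illustrative, not libyuv API).

// Copy a width x height window starting at pixel (x, y) of a larger Y plane
// into a tightly packed destination (dst stride == width).
void CopyYWindow(const uint8* src_full, int full_stride,
                 int x, int y, int width, int height,
                 uint8* dst_packed) {
  const uint8* src_origin = src_full + y * full_stride + x;
  CopyPlane(src_origin, full_stride,  // source rows use the full image pitch
            dst_packed, width,        // destination rows are packed
            width, height);
}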
-// Copy I420 with optional flipping
-int I420Copy(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ uint8*, int, // src_u
+ uint8*, int, // src_v
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
-
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
}
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
return 0;
}
-// SetRows32 writes 'count' bytes using a 32 bit value repeated
-
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
-#define HAS_SETROW_NEON
-static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
- __asm__ volatile
- (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1:\n"
- "vst1.u32 {q0}, [%0]! \n" // store
- "subs %1, %1, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "q0", "memory"
- );
-}
-
-#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SETROW_SSE2
-__declspec(naked)
-static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
- __asm {
- mov eax, [esp + 4] // dst
- movd xmm7, [esp + 8] // v32
- mov ecx, [esp + 12] // count
- pshufd xmm7, xmm7, 0
-
- wloop:
- movdqa [eax], xmm7
- lea eax, [eax + 16]
- sub ecx, 16
- ja wloop
- ret
+// Mirror a plane of data
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-#define HAS_SETROW_SSE2
-static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
- asm volatile(
- "movd %2, %%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
-"1:"
- "movdqa %%xmm7,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%1\n"
- "ja 1b\n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "memory"
-);
-}
#endif
-
-static void SetRow8_C(uint8* dst, uint32 v8, int count) {
- memset(dst, v8, count);
-}
-
-static void I420SetPlane(uint8* dst_y, int dst_stride_y,
- int width, int height,
- int value) {
- void (*SetRow)(uint8* dst, uint32 value, int pix);
-#if defined(HAS_SETROW_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- SetRow = SetRow32_NEON;
- } else
-#elif defined(HAS_SETROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- SetRow = SetRow32_SSE2;
- } else
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
#endif
- {
- SetRow = SetRow8_C;
}
+#endif
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
- // Set plane
+ // Mirror plane
for (int y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
-// Draw a rectangle into I420
-int I420Rect(uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int x, int y,
- int width, int height,
- int value_y, int value_u, int value_v) {
- if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0 ||
- x < 0 || y < 0 ||
- value_y < 0 || value_y > 255 ||
- value_u < 0 || value_u > 255 ||
- value_v < 0 || value_v > 255) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- uint8* start_y = dst_y + y * dst_stride_y + x;
- uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
- uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-
- I420SetPlane(start_y, dst_stride_y, width, height, value_y);
- I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
- I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
- return 0;
-}
-
-// Helper function to copy yuv data without scaling. Used
-// by our jpeg conversion callbacks to incrementally fill a yuv image.
-int I422ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
@@ -332,1244 +115,1314 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
}
-
- // Copy Y plane
- I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-
- // SubSample UV planes.
- int x, y;
- int halfwidth = (width + 1) >> 1;
- for (y = 0; y < height; y += 2) {
- const uint8* u0 = src_u;
- const uint8* u1 = src_u + src_stride_u;
- if ((y + 1) >= height) {
- u1 = u0;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUV422Row = YUY2ToUV422Row_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
- for (x = 0; x < halfwidth; ++x) {
- dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
}
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
}
- for (y = 0; y < height; y += 2) {
- const uint8* v0 = src_v;
- const uint8* v1 = src_v + src_stride_v;
- if ((y + 1) >= height) {
- v1 = v0;
+#elif defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width > 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
}
- for (x = 0; x < halfwidth; ++x) {
- dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- return 0;
-}
-
-static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
- uint8* dst, int dst_stride_frame,
- int width, int height) {
- // Copy plane
- for (int y = 0; y < height; y += 2) {
- memcpy(dst, src, width);
- src += src_stride_0;
- dst += dst_stride_frame;
- memcpy(dst, src, width);
- src += src_stride_1;
- dst += dst_stride_frame;
}
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
- int src_stride_y0, int src_stride_y1,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- int halfwidth = (width + 1) >> 1;
- void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
-#if defined(HAS_SPLITUV_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (halfwidth % 16 == 0) &&
- IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
- IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
- IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
- SplitUV = SplitUV_NEON;
- } else
-#elif defined(HAS_SPLITUV_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (halfwidth % 16 == 0) &&
- IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
- IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
- IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
- SplitUV = SplitUV_SSE2;
- } else
#endif
- {
- SplitUV = SplitUV_C;
- }
-
- I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- int halfheight = (height + 1) >> 1;
- for (int y = 0; y < halfheight; ++y) {
- // Copy a row of UV.
- SplitUV(src_uv, dst_u, dst_v, halfwidth);
+ for (int y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
- src_uv += src_stride_uv;
}
return 0;
}
-// Convert M420 to I420.
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert NV12 to I420.
-int NV12ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_uv, int src_stride_uv,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y,
- src_uv, src_stride_uv,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-// Convert NV12 to I420. Deprecated.
-int NV12ToI420(const uint8* src_y,
- const uint8* src_uv,
- int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return X420ToI420(src_y, src_stride_frame, src_stride_frame,
- src_uv, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height);
-}
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_SPLITYUY2_SSE2
-__declspec(naked)
-static void SplitYUY2_SSE2(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov edx, [esp + 8 + 8] // dst_y
- mov esi, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm2, xmm7 // even bytes are Y
- pand xmm3, xmm7
- packuswb xmm2, xmm3
- movdqa [edx], xmm2
- lea edx, [edx + 16]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [esi], xmm0
- lea esi, [esi + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUV422Row = UYVYToUV422Row_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width > 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width > 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ }
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
}
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_SPLITYUY2_SSE2
-static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "packuswb %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,(%1)\n"
- "lea 0x10(%1),%1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%2)\n"
- "lea 0x8(%2),%2\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%3)\n"
- "lea 0x8(%3),%3\n"
- "sub $0x10,%4\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(pix) // %4
- :
- : "memory"
-);
-}
#endif
-static void SplitYUY2_C(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of YUY2.
- for (int x = 0; x < pix; x += 2) {
- dst_y[0] = src_yuy2[0];
- dst_y[1] = src_yuy2[2];
- dst_u[0] = src_yuy2[1];
- dst_v[0] = src_yuy2[3];
- src_yuy2 += 4;
- dst_y += 2;
- dst_u += 1;
- dst_v += 1;
+ for (int y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
}
+ return 0;
}
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
int halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- void (*SplitYUY2)(const uint8* src_yuy2,
- uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
-#if defined(HAS_SPLITYUY2_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- SplitYUY2 = SplitYUY2_SSE2;
- } else
-#endif
- {
- SplitYUY2 = SplitYUY2_C;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
}
- for (int y = 0; y < height; y += 2) {
- memcpy(dst_y, src_y, width);
- dst_y += dst_stride_y;
- src_y += src_stride_y;
- // Copy a row of YUY2.
- SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- src_yuy2 += src_stride_yuy2;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_YUY2TOI420ROW_SSE2
-__declspec(naked)
-void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm7 // even bytes are Y
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
- ret
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-__declspec(naked)
-void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_y, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
-}
-#define HAS_UYVYTOI420ROW_SSE2
-__declspec(naked)
-void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
- ret
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
-}
+#endif
-__declspec(naked)
-void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_y, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm7 // UYVY -> UVUV
- pand xmm1, xmm7
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm7 // U
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edi], xmm1
- lea edi, [edi + 8]
- sub ecx, 16
- ja wloop
-
- pop edi
- pop esi
- ret
+ // Mirror plane
+ for (int y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
}
+ return 0;
}
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-#define HAS_YUY2TOI420ROW_SSE2
-static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
-}
-
-static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_y), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
- : "memory"
-);
-}
-#define HAS_UYVYTOI420ROW_SSE2
-static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- asm volatile(
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
-}
-
-static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_y, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "pand %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm1\n"
- "movq %%xmm1,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x10,%3\n"
- "ja 1b\n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_y), // %2
- "+r"(pix) // %3
- : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
- : "memory"
-);
-}
+// Get a blender that is optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ return ARGBBlendRow;
+ }
#endif
-
-// Filter 2 rows of YUY2 UV's (422) into U and V (420)
-void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Output a row of UV values, filtering 2 rows of YUY2
- for (int x = 0; x < pix; x += 2) {
- dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
- dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
- src_yuy2 += 4;
- dst_u += 1;
- dst_v += 1;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
}
+#endif
+ return ARGBBlendRow;
}
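Following the advice in the comment above GetARGBBlend(), a caller can resolve the row blender once and reuse it for every row. A minimal sketch with illustrative buffer names; ARGBBlend() below does essentially the same internally.

void BlendImage(const uint8* src0, int stride0,
                const uint8* src1, int stride1,
                uint8* dst, int dst_stride,
                int width, int height) {
  ARGBBlendRow blend = GetARGBBlend();  // chosen once for the whole image
  for (int y = 0; y < height; ++y) {
    blend(src0, src1, dst, width);
    src0 += stride0;
    src1 += stride1;
    dst += dst_stride;
  }
}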
-void YUY2ToI420RowY_C(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- // Copy a row of yuy2 Y values
- for (int x = 0; x < pix; ++x) {
- dst_y[0] = src_yuy2[0];
- src_yuy2 += 2;
- dst_y += 1;
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- // Copy a row of uyvy UV values
- for (int x = 0; x < pix; x += 2) {
- dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
- dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
- src_uyvy += 4;
- dst_u += 1;
- dst_v += 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-}
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
-void UYVYToI420RowY_C(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- // Copy a row of uyvy Y values
- for (int x = 0; x < pix; ++x) {
- dst_y[0] = src_uyvy[1];
- src_uyvy += 2;
- dst_y += 1;
+ for (int y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
}
+ return 0;
}
-// Convert YUY2 to I420.
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
int width, int height) {
- // Negative height means invert the image.
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
- src_stride_yuy2 = -src_stride_yuy2;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
- void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*YUY2ToI420RowY)(const uint8* src_yuy2,
- uint8* dst_y, int pix);
-#if defined(HAS_YUY2TOI420ROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
- YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
- } else
-#endif
- {
- YUY2ToI420RowY = YUY2ToI420RowY_C;
- YUY2ToI420RowUV = YUY2ToI420RowUV_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
+#endif
+
for (int y = 0; y < height; ++y) {
- if ((y & 1) == 0) {
- if (y >= (height - 1) ) { // last chroma on odd height clamp height
- src_stride_yuy2 = 0;
- }
- YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- YUY2ToI420RowY(src_yuy2, dst_y, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
dst_y += dst_stride_y;
- src_yuy2 += src_stride_yuy2;
}
return 0;
}
-// Convert UYVY to I420.
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+// ARGB (little endian, BGRA byte order in memory) to I422.
+// Same as I420 except the U and V planes are full height.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- // Negative height means invert the image.
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
- src_stride_uyvy = -src_stride_uyvy;
- }
- void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
- void (*UYVYToI420RowY)(const uint8* src_uyvy,
- uint8* dst_y, int pix);
-#if defined(HAS_UYVYTOI420ROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
- IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
- IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
- UYVYToI420RowY = UYVYToI420RowY_SSE2;
- UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
- } else
-#endif
- {
- UYVYToI420RowY = UYVYToI420RowY_C;
- UYVYToI420RowUV = UYVYToI420RowUV_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
- for (int y = 0; y < height; ++y) {
- if ((y & 1) == 0) {
- if (y >= (height - 1) ) { // last chroma on odd height clamp height
- src_stride_uyvy = 0;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (width > 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
}
- UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
}
- UYVYToI420RowY(src_uyvy, dst_y, width);
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
dst_y += dst_stride_y;
- src_uyvy += src_stride_uyvy;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
}
return 0;
}
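To make the "full height" remark above concrete, a quick size check (plain arithmetic, not libyuv API):

// Bytes needed for each chroma plane, rounding odd dimensions up.
int I420ChromaPlaneSize(int width, int height) {
  return ((width + 1) / 2) * ((height + 1) / 2);  // half width, half height
}
int I422ChromaPlaneSize(int width, int height) {
  return ((width + 1) / 2) * height;              // half width, full height
}
// e.g. 1280x720: I420 U or V = 640 * 360 = 230400 bytes,
//                I422 U or V = 640 * 720 = 460800 bytes.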
-// Convert I420 to ARGB.
-// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I420 to BGRA.
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- EMMS();
return 0;
}
-// Convert I420 to BGRA.
-int I420ToABGR(const uint8* src_y, int src_stride_y,
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
}
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
- EMMS();
return 0;
}
-// Convert I422 to ARGB.
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
+ if (!src_argb || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) =
+ ARGBToRGBARow_C;
+#if defined(HAS_ARGBTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ ARGBToRGBARow = ARGBToRGBARow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBToRGBARow = ARGBToRGBARow_NEON;
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
+ ARGBToRGBARow(src_argb, dst_rgba, width);
+ src_argb += src_stride_argb;
+ dst_rgba += dst_stride_rgba;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I444 to ARGB.
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- // Negative height means invert the image.
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB24Row_C;
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
}
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// Convert I400 to ARGB.
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- // Negative height means invert the image.
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRAWRow_C;
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ }
+ if (IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (width * 3 <= kMaxStride) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
for (int y = 0; y < height; ++y) {
- FastConvertYToRGB32Row(src_y, dst_argb, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
}
- // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
- EMMS();
return 0;
}
-// TODO(fbarchard): 64 bit version
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-
-#define HAS_I400TOARGBROW_SSE2
-__declspec(naked)
-static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
-
- wloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm7
- por xmm1, xmm7
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- ja wloop
- ret
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-#define HAS_ABGRTOARGBROW_SSSE3
-__declspec(naked)
-static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_abgr
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm7, _kShuffleMaskABGRToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- ja convertloop
- ret
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
-}
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
-#define HAS_BGRATOARGBROW_SSSE3
-__declspec(naked)
-static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_bgra
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- movdqa xmm7, _kShuffleMaskBGRAToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax + 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- ja convertloop
- ret
+ for (int y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
}
+ return 0;
}
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-
-// TODO(yuche): consider moving ARGB related codes to a separate file.
-#define HAS_I400TOARGBROW_SSE2
-static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "pslld $0x18,%%xmm7\n"
-"1:"
- "movq (%0),%%xmm0\n"
- "lea 0x8(%0),%0\n"
- "punpcklbw %%xmm0,%%xmm0\n"
- "movdqa %%xmm0,%%xmm1\n"
- "punpcklwd %%xmm0,%%xmm0\n"
- "punpckhwd %%xmm1,%%xmm1\n"
- "por %%xmm7,%%xmm0\n"
- "por %%xmm7,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "lea 0x20(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory"
-);
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
}
-#define HAS_ABGRTOARGBROW_SSSE3
-static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
- int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
- : "+r"(src_abgr), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskABGRToARGB) // %3
- : "memory"
-);
-}
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (width * 2 <= kMaxStride) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ }
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
-#define HAS_BGRATOARGBROW_SSSE3
-static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
- int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea 0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
- : "+r"(src_bgra), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskBGRAToARGB) // %3
- : "memory"
-);
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
}
+// Convert NV12 to RGB565.
+// TODO(fbarchard): (Re) Optimize for Neon.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
#endif
-static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
- // Copy a Y to RGB.
- for (int x = 0; x < pix; ++x) {
- uint8 y = src_y[0];
- dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
- dst_argb[3] = 255u;
- dst_argb += 4;
- ++src_y;
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, row, width);
+ ARGBToRGB565Row(row, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
}
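As a hedged usage sketch (not part of this patch): NV12 stores a full-size Y plane followed by an interleaved UV plane at half vertical resolution, so a call for a 640x480 frame could look like the following. The buffer names, frame size, and the planar_functions.h declaration location are assumptions.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Convert a packed 640x480 NV12 buffer (Y plane, then an interleaved UV
// plane of height 240) to RGB565. Names and sizes here are illustrative.
int ConvertNV12FrameToRGB565(const uint8* nv12, uint8* dst_rgb565) {
  const int width = 640;
  const int height = 480;
  const uint8* src_y = nv12;
  const uint8* src_uv = nv12 + width * height;  // interleaved U/V pairs
  return libyuv::NV12ToRGB565(src_y, width,
                              src_uv, width,          // UV stride in bytes
                              dst_rgb565, width * 2,  // 2 bytes per pixel
                              width, height);
}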
-// Convert I400 to ARGB.
-int I400ToARGB(const uint8* src_y, int src_stride_y,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
- void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
-#if defined(HAS_I400TOARGBROW_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- } else
#endif
- {
- I400ToARGBRow = I400ToARGBRow_C;
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
+#endif
for (int y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ NV21ToARGBRow(src_y, src_vu, row, width);
+ ARGBToRGB565Row(row, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
- dst_argb += dst_stride_argb;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
}
return 0;
}
-static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- // To support in-place conversion.
- uint8 r = src_abgr[0];
- uint8 g = src_abgr[1];
- uint8 b = src_abgr[2];
- uint8 a = src_abgr[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_abgr += 4;
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value) {
+ void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow8_C;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ SetRow = SetRow8_NEON;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ SetRow = SetRow8_X86;
+ }
+#endif
+
+ uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
+ // Set plane
+ for (int y = 0; y < height; ++y) {
+ SetRow(dst_y, v32, width);
+ dst_y += dst_stride_y;
}
}
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y,
+ int width, int height,
+ int value_y, int value_u, int value_v) {
+ if (!dst_y || !dst_u || !dst_v ||
+ width <= 0 || height <= 0 ||
+ x < 0 || y < 0 ||
+ value_y < 0 || value_y > 255 ||
+ value_u < 0 || value_u > 255 ||
+ value_v < 0 || value_v > 255) {
+ return -1;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+ SetPlane(start_y, dst_stride_y, width, height, value_y);
+ SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+ SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+ return 0;
+}
+
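A hedged usage sketch for I420Rect (not part of this patch); the plane names and frame size are illustrative.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Fill a 32x32 mid-gray square at (8, 8) of a small 64x48 I420 frame.
void DrawGraySquare() {
  const int kWidth = 64;
  const int kHeight = 48;
  const int kHalfWidth = (kWidth + 1) / 2;
  const int kHalfHeight = (kHeight + 1) / 2;
  static uint8 y_plane[kWidth * kHeight];
  static uint8 u_plane[kHalfWidth * kHalfHeight];
  static uint8 v_plane[kHalfWidth * kHalfHeight];
  // Y = 128 with U = V = 128 is a neutral mid gray.
  libyuv::I420Rect(y_plane, kWidth,
                   u_plane, kHalfWidth,
                   v_plane, kHalfWidth,
                   8, 8,            // x, y
                   32, 32,          // width, height of the rectangle
                   128, 128, 128);  // value_y, value_u, value_v
}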
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height,
+ uint32 value) {
+ if (!dst_argb ||
+ width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SetRows32_NEON(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRows32_X86(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+ SetRows32_C(dst, value, width, dst_stride_argb, height);
+ return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_abgr = src_abgr + (height - 1) * src_stride_abgr;
- src_stride_abgr = -src_stride_abgr;
- }
-void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
-#if defined(HAS_ABGRTOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- ABGRToARGBRow = ABGRToARGBRow_SSSE3;
- } else
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+#if defined(HAS_ARGBATTENUATE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
#endif
- {
- ABGRToARGBRow = ABGRToARGBRow_C;
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
+#endif
for (int y = 0; y < height; ++y) {
- ABGRToARGBRow(src_abgr, dst_argb, width);
- src_abgr += src_stride_abgr;
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
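A minimal per-pixel C sketch of the attenuation the comment above describes, multiplying each color channel by its alpha so the cheaper blend p = f + (1 - a) * b can be used. The optimized rows in this patch may round differently; this is an illustration, not the row implementation.

#include "libyuv/basic_types.h"  // for uint8/uint32; assumed include path

// Illustrative only: premultiply one BGRA pixel (byte order B, G, R, A in
// memory) by its alpha. Rounding here is a plain /255, which may not match
// the SSE2/SSSE3 rows bit for bit.
static void AttenuatePixel_Reference(const uint8 src[4], uint8 dst[4]) {
  const uint32 a = src[3];
  dst[0] = static_cast<uint8>(src[0] * a / 255);  // B
  dst[1] = static_cast<uint8>(src[1] * a / 255);  // G
  dst[2] = static_cast<uint8>(src[2] * a / 255);  // R
  dst[3] = src[3];                                // A unchanged
}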
-static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- // To support in-place conversion.
- uint8 a = src_bgra[0];
- uint8 r = src_bgra[1];
- uint8 g = src_bgra[2];
- uint8 b = src_bgra[3];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = a;
- dst_argb += 4;
- src_bgra += 4;
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
}
-}
-
-// Convert BGRA to ARGB.
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
if (height < 0) {
height = -height;
- src_bgra = src_bgra + (height - 1) * src_stride_bgra;
- src_stride_bgra = -src_stride_bgra;
- }
- void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
-#if defined(HAS_BGRATOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- BGRAToARGBRow = BGRAToARGBRow_SSSE3;
- } else
-#endif
- {
- BGRAToARGBRow = BGRAToARGBRow_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- BGRAToARGBRow(src_bgra, dst_argb, width);
- src_bgra += src_stride_bgra;
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
-// Convert ARGB to I400.
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
+// Convert ARGB to Grayed ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
-#if defined(HAS_ARGBTOYROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 4 == 0) &&
- IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
- IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- } else
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
#endif
- {
- ARGBToYRow = ARGBToYRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(dst, dst, width);
+ dst += dst_stride_argb;
}
+ return 0;
+}
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
for (int y = 0; y < height; ++y) {
- ARGBToYRow(src_argb, dst_y, width);
+ ARGBSepiaRow(dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x3 color matrix to each ARGB pixel.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
+ int width) = ARGBColorMatrixRow_C;
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(dst, matrix_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBColorTableRow = ARGBColorTableRow_X86;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
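An illustrative per-pixel version of the lookup ARGBColorTable performs, assuming the 256-entry table is laid out as consecutive B, G, R, A quadruplets and each channel indexes its own column. The layout is an assumption, not a quote of the row code.

#include "libyuv/basic_types.h"  // for uint8; assumed include path

// Hypothetical reference for a single pixel: each channel selects its own
// component from the assumed 256 x 4 byte table.
static void ColorTablePixel_Reference(uint8 pixel[4], const uint8* table_argb) {
  pixel[0] = table_argb[pixel[0] * 4 + 0];  // B
  pixel[1] = table_argb[pixel[1] * 4 + 1];  // G
  pixel[2] = table_argb[pixel[2] * 4 + 2];  // R
  pixel[3] = table_argb[pixel[3] * 4 + 3];  // A
}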
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low-level rows implement it efficiently with 3 parameters, and
+// could be used for other high-level operations.
+// The divide is replaced with a fixed-point multiply by the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+ interval_size < 1 || interval_size > 255) {
+ return -1;
+ }
+ void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
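A worked example of choosing the three parameters for the posterize expression above. Treating scale as a 16.16 fixed-point reciprocal of interval_size is an assumption about how the row functions consume it, and the helper name is illustrative.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Hedged sketch: posterize an ARGB image to 4 levels per channel, i.e.
// rgb / 64 * 64 + 32, with the divide expressed as a reciprocal multiply.
int PosterizeTo4Levels(uint8* dst_argb, int dst_stride_argb,
                       int width, int height) {
  const int interval_size = 64;
  const int interval_offset = interval_size / 2;  // 32
  const int scale = 65536 / interval_size;        // assumed 16.16 reciprocal
  return libyuv::ARGBQuantize(dst_argb, dst_stride_argb,
                              scale, interval_size, interval_offset,
                              0, 0, width, height);
}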
+// Computes a table of cumulative sums for the image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height) {
+ if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ }
+#endif
+ memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
+ int32* previous_cumsum = dst_cumsum;
+ for (int y = 0; y < height; ++y) {
+ ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+ previous_cumsum = dst_cumsum;
+ dst_cumsum += dst_stride32_cumsum;
src_argb += src_stride_argb;
- dst_y += dst_stride_y;
}
return 0;
}
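The table this builds is an integral image: any axis-aligned box sum then needs only four lookups. A minimal sketch of that identity for one channel (illustrative; not the row code used by ARGBBlur):

#include "libyuv/basic_types.h"  // for int32; assumed include path

// Illustrative: with cumsum(x, y) holding the sum of all values above and to
// the left of (x, y), the sum over the half-open box [x0, x1) x [y0, y1) is:
static int32 BoxSum(const int32* cumsum, int stride,
                    int x0, int y0, int x1, int y1) {
  return cumsum[y1 * stride + x1] - cumsum[y1 * stride + x0]
       - cumsum[y0 * stride + x1] + cumsum[y0 * stride + x0];
}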
+// Blur ARGB image.
+// The caller should allocate a CumulativeSum table of width * height * 16
+// bytes, aligned to a 16-byte boundary. height can be radius * 2 + 2 to save
+// memory, as the buffer is treated as circular.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
+ }
+#endif
+ // Compute enough CumulativeSum for first row to be blurred. After this
+ // one row of CumulativeSum is updated at a time.
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+ dst_cumsum, dst_stride32_cumsum,
+ width, radius);
-// Convert RAW to ARGB.
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ src_argb = src_argb + radius * src_stride_argb;
+ int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+ const int32* max_cumsum_bot_row =
+ &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ const int32* cumsum_top_row = &dst_cumsum[0];
+
+ for (int y = 0; y < height; ++y) {
+ int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+ int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+ int area = radius * (bot_y - top_y);
+
+ // Increment cumsum_top_row pointer with circular buffer wrap around.
+ if (top_y) {
+ cumsum_top_row += dst_stride32_cumsum;
+ if (cumsum_top_row >= max_cumsum_bot_row) {
+ cumsum_top_row = dst_cumsum;
+ }
+ }
+ // Increment cumsum_bot_row pointer with circular buffer wrap around and
+ // then fill in a row of CumulativeSum.
+ if ((y + radius) < height) {
+ const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ cumsum_bot_row += dst_stride32_cumsum;
+ if (cumsum_bot_row >= max_cumsum_bot_row) {
+ cumsum_bot_row = dst_cumsum;
+ }
+ ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+ width);
+ src_argb += src_stride_argb;
+ }
+
+ // Left clipped.
+ int boxwidth = radius * 4;
+ int x;
+ for (x = 0; x < radius + 1; ++x) {
+ CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ area += (bot_y - top_y);
+ boxwidth += 4;
+ }
+
+ // Middle unclipped.
+ int n = (width - 1) - radius - x + 1;
+ CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
+
+ // Right clipped.
+ for (x += n; x <= width - 1; ++x) {
+ area -= (bot_y - top_y);
+ boxwidth -= 4;
+ CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ }
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
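A hedged usage sketch that follows the allocation rule in the comment above (16 bytes of cumulative sum per pixel, with only radius * 2 + 2 rows because the buffer is circular). The wrapper name and the use of operator new are illustrative, and a real caller should guarantee 16-byte alignment.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Illustrative wrapper: allocate the circular cumulative-sum buffer and blur.
int BlurARGBFrame(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height, int radius) {
  const int cumsum_rows = radius * 2 + 2;  // circular buffer height
  const int stride32 = width * 4;          // 4 int32 (16 bytes) per pixel
  // Use an aligned allocator in real code to satisfy the 16-byte requirement.
  int32* cumsum = new int32[stride32 * cumsum_rows];
  int ret = libyuv::ARGBBlur(src_argb, src_stride_argb,
                             dst_argb, dst_stride_argb,
                             cumsum, stride32,
                             width, height, radius);
  delete[] cumsum;
  return ret;
}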
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+ int width, int height, uint32 value) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+ return -1;
+ }
if (height < 0) {
height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- } else
-#endif
- {
- RAWToARGBRow = RAWToARGBRow_C;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, uint32 value) = ARGBShadeRow_C;
+#if defined(HAS_ARGBSHADE_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBShadeRow = ARGBShadeRow_SSE2;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
+ ARGBShadeRow(src_argb, dst_argb, width, value);
+ src_argb += src_stride_argb;
dst_argb += dst_stride_argb;
}
return 0;
}
-// Convert BG24 to ARGB.
-int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
if (height < 0) {
height = -height;
- src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
- src_stride_bg24 = -src_stride_bg24;
- }
- void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
-#if defined(HAS_BG24TOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- BG24ToARGBRow = BG24ToARGBRow_SSSE3;
- } else
-#endif
- {
- BG24ToARGBRow = BG24ToARGBRow_C;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-
+ void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = ARGBInterpolateRow_C;
+#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+ IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
+ }
+#endif
for (int y = 0; y < height; ++y) {
- BG24ToARGBRow(src_bg24, dst_argb, width);
- src_bg24 += src_stride_bg24;
+ ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+ width, interpolation);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb;
}
return 0;
}
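A per-channel sketch of the blend performed for an interpolation amount in 0..255 (0 leaves the first image, 255 is almost entirely the second). The optimized rows may round differently, so this is an illustration rather than the exact row code.

#include "libyuv/basic_types.h"  // for uint8; assumed include path

// Illustrative linear interpolation of one channel by a 0..255 fraction.
static uint8 LerpChannel(uint8 c0, uint8 c1, int interpolation) {
  return static_cast<uint8>(
      (c0 * (256 - interpolation) + c1 * interpolation) >> 8);
}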
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
-
+#endif
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index 12cdd7e1..cac3fa0b 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,49 +8,44 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
-#include "rotate_priv.h"
#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#if !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
#endif
-// Shuffle table for reversing the bytes.
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
- { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
-// Shuffle table for reversing the bytes of UV channels.
-extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
- { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif
-typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
-typedef void (*reverse_func)(const uint8*, uint8*, int);
-typedef void (*rotate_uv_wx8_func)(const uint8*, int,
- uint8*, int,
- uint8*, int, int);
-typedef void (*rotate_uv_wxh_func)(const uint8*, int,
- uint8*, int,
- uint8*, int, int, int);
-typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
-typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
-
-#if 0 // Need to add rotate_neon.s to the build to enable this
-#ifdef __ARM_NEON__
-extern "C" {
-void RestoreRegisters_NEON(unsigned long long *restore);
-void SaveRegisters_NEON(unsigned long long *store);
-#define HAS_REVERSE_LINE_NEON
-void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_REVERSE_LINE_UV_NEON
-void ReverseLineUV_NEON(const uint8* src,
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#define HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_MIRRORROW_UV_NEON
+void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width);
#define HAS_TRANSPOSE_WX8_NEON
@@ -61,16 +56,14 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
-} // extern "C"
-#endif
-#endif
+#endif // defined(__ARM_NEON__)
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
-__asm {
+ __asm {
push edi
push esi
push ebp
@@ -79,9 +72,11 @@ __asm {
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
- convertloop :
+
// Read in the data from the source pointer.
// First round of bit swap.
+ align 16
+ convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
@@ -144,10 +139,10 @@ __asm {
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
+ sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
- sub ecx, 8
- ja convertloop
+ jg convertloop
pop ebp
pop esi
@@ -157,12 +152,12 @@ __asm {
}
#define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
-__asm {
+ __asm {
push ebx
push esi
push edi
@@ -178,7 +173,9 @@ __asm {
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
- convertloop :
+
+ align 16
+ convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqa xmm0, [eax]
@@ -268,12 +265,12 @@ __asm {
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
+ sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
- sub ecx, 8
- ja convertloop
+ jg convertloop
mov esp, [esp + 16]
pop ebp
@@ -283,356 +280,355 @@ __asm {
ret
}
}
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
- asm volatile(
-"1:"
- // Read in the data from the source pointer.
- // First round of bit swap.
- "movq (%0),%%xmm0\n"
- "movq (%0,%3),%%xmm1\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "movq (%0),%%xmm2\n"
- "movdqa %%xmm0,%%xmm1\n"
- "palignr $0x8,%%xmm1,%%xmm1\n"
- "movq (%0,%3),%%xmm3\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "movdqa %%xmm2,%%xmm3\n"
- "movq (%0),%%xmm4\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "movq (%0,%3),%%xmm5\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "movdqa %%xmm4,%%xmm5\n"
- "movq (%0),%%xmm6\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq (%0,%3),%%xmm7\n"
- "lea (%0,%3,2),%0\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "neg %3\n"
- "movdqa %%xmm6,%%xmm7\n"
- "lea 0x8(%0,%3,8),%0\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "neg %3\n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "palignr $0x8,%%xmm2,%%xmm2\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm4,%%xmm6\n"
- "movdqa %%xmm5,%%xmm7\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqa %%xmm0,%%xmm4\n"
- "palignr $0x8,%%xmm4,%%xmm4\n"
- "movq %%xmm4,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movdqa %%xmm2,%%xmm6\n"
- "movq %%xmm2,(%1)\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movq %%xmm6,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm1,%%xmm5\n"
- "movq %%xmm1,(%1)\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq %%xmm5,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movq %%xmm3,(%1)\n"
- "movdqa %%xmm3,%%xmm7\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "movq %%xmm7,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
-);
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "r"(static_cast<intptr_t>(dst_stride)) // %4
+ : "memory", "cc"
+ #if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ #endif
+ );
}
-#if defined (__i386__)
+#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _TransposeUVWx8_SSE2\n"
-"_TransposeUVWx8_SSE2:\n"
-#else
- ".global TransposeUVWx8_SSE2\n"
-"TransposeUVWx8_SSE2:\n"
-#endif
- "push %ebx\n"
- "push %esi\n"
- "push %edi\n"
- "push %ebp\n"
- "mov 0x14(%esp),%eax\n"
- "mov 0x18(%esp),%edi\n"
- "mov 0x1c(%esp),%edx\n"
- "mov 0x20(%esp),%esi\n"
- "mov 0x24(%esp),%ebx\n"
- "mov 0x28(%esp),%ebp\n"
- "mov %esp,%ecx\n"
- "sub $0x14,%esp\n"
- "and $0xfffffff0,%esp\n"
- "mov %ecx,0x10(%esp)\n"
- "mov 0x2c(%ecx),%ecx\n"
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
-"1:"
- "movdqa (%eax),%xmm0\n"
- "movdqa (%eax,%edi,1),%xmm1\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm0,%xmm7\n"
- "punpcklbw %xmm1,%xmm0\n"
- "punpckhbw %xmm1,%xmm7\n"
- "movdqa %xmm7,%xmm1\n"
- "movdqa (%eax),%xmm2\n"
- "movdqa (%eax,%edi,1),%xmm3\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm2,%xmm7\n"
- "punpcklbw %xmm3,%xmm2\n"
- "punpckhbw %xmm3,%xmm7\n"
- "movdqa %xmm7,%xmm3\n"
- "movdqa (%eax),%xmm4\n"
- "movdqa (%eax,%edi,1),%xmm5\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm4,%xmm7\n"
- "punpcklbw %xmm5,%xmm4\n"
- "punpckhbw %xmm5,%xmm7\n"
- "movdqa %xmm7,%xmm5\n"
- "movdqa (%eax),%xmm6\n"
- "movdqa (%eax,%edi,1),%xmm7\n"
- "lea (%eax,%edi,2),%eax\n"
- "movdqa %xmm5,(%esp)\n"
- "neg %edi\n"
- "movdqa %xmm6,%xmm5\n"
- "punpcklbw %xmm7,%xmm6\n"
- "punpckhbw %xmm7,%xmm5\n"
- "movdqa %xmm5,%xmm7\n"
- "lea 0x10(%eax,%edi,8),%eax\n"
- "neg %edi\n"
- "movdqa %xmm0,%xmm5\n"
- "punpcklwd %xmm2,%xmm0\n"
- "punpckhwd %xmm2,%xmm5\n"
- "movdqa %xmm5,%xmm2\n"
- "movdqa %xmm1,%xmm5\n"
- "punpcklwd %xmm3,%xmm1\n"
- "punpckhwd %xmm3,%xmm5\n"
- "movdqa %xmm5,%xmm3\n"
- "movdqa %xmm4,%xmm5\n"
- "punpcklwd %xmm6,%xmm4\n"
- "punpckhwd %xmm6,%xmm5\n"
- "movdqa %xmm5,%xmm6\n"
- "movdqa (%esp),%xmm5\n"
- "movdqa %xmm6,(%esp)\n"
- "movdqa %xmm5,%xmm6\n"
- "punpcklwd %xmm7,%xmm5\n"
- "punpckhwd %xmm7,%xmm6\n"
- "movdqa %xmm6,%xmm7\n"
- "movdqa %xmm0,%xmm6\n"
- "punpckldq %xmm4,%xmm0\n"
- "punpckhdq %xmm4,%xmm6\n"
- "movdqa %xmm6,%xmm4\n"
- "movdqa (%esp),%xmm6\n"
- "movlpd %xmm0,(%edx)\n"
- "movhpd %xmm0,(%ebx)\n"
- "movlpd %xmm4,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm4,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm2,%xmm0\n"
- "punpckldq %xmm6,%xmm2\n"
- "movlpd %xmm2,(%edx)\n"
- "movhpd %xmm2,(%ebx)\n"
- "punpckhdq %xmm6,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm1,%xmm0\n"
- "punpckldq %xmm5,%xmm1\n"
- "movlpd %xmm1,(%edx)\n"
- "movhpd %xmm1,(%ebx)\n"
- "punpckhdq %xmm5,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "movdqa %xmm3,%xmm0\n"
- "punpckldq %xmm7,%xmm3\n"
- "movlpd %xmm3,(%edx)\n"
- "movhpd %xmm3,(%ebx)\n"
- "punpckhdq %xmm7,%xmm0\n"
- "movlpd %xmm0,(%edx,%esi,1)\n"
- "lea (%edx,%esi,2),%edx\n"
- "movhpd %xmm0,(%ebx,%ebp,1)\n"
- "lea (%ebx,%ebp,2),%ebx\n"
- "sub $0x8,%ecx\n"
- "ja 1b\n"
- "mov 0x10(%esp),%esp\n"
- "pop %ebp\n"
- "pop %edi\n"
- "pop %esi\n"
- "pop %ebx\n"
- "ret\n"
+"1: \n"
+ "movdqa (%eax),%xmm0 \n"
+ "movdqa (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqa (%eax),%xmm2 \n"
+ "movdqa (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqa (%eax),%xmm4 \n"
+ "movdqa (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqa (%eax),%xmm6 \n"
+ "movdqa (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqa (%esp),%xmm5 \n"
+ "movdqa %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqa (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+ "ret \n"
);
-#elif defined (__x86_64__)
+#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
- asm volatile(
-"1:"
+ asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3),%%xmm1\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm0,%%xmm8\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "punpckhbw %%xmm1,%%xmm8\n"
- "movdqa (%0),%%xmm2\n"
- "movdqa %%xmm0,%%xmm1\n"
- "movdqa %%xmm8,%%xmm9\n"
- "palignr $0x8,%%xmm1,%%xmm1\n"
- "palignr $0x8,%%xmm9,%%xmm9\n"
- "movdqa (%0,%3),%%xmm3\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm2,%%xmm10\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "punpckhbw %%xmm3,%%xmm10\n"
- "movdqa %%xmm2,%%xmm3\n"
- "movdqa %%xmm10,%%xmm11\n"
- "movdqa (%0),%%xmm4\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "palignr $0x8,%%xmm11,%%xmm11\n"
- "movdqa (%0,%3),%%xmm5\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm4,%%xmm12\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "punpckhbw %%xmm5,%%xmm12\n"
- "movdqa %%xmm4,%%xmm5\n"
- "movdqa %%xmm12,%%xmm13\n"
- "movdqa (%0),%%xmm6\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "palignr $0x8,%%xmm13,%%xmm13\n"
- "movdqa (%0,%3),%%xmm7\n"
- "lea (%0,%3,2),%0\n"
- "movdqa %%xmm6,%%xmm14\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "punpckhbw %%xmm7,%%xmm14\n"
- "neg %3\n"
- "movdqa %%xmm6,%%xmm7\n"
- "movdqa %%xmm14,%%xmm15\n"
- "lea 0x10(%0,%3,8),%0\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
- "neg %3\n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqa (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqa (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "palignr $0x8,%%xmm2,%%xmm2\n"
- "palignr $0x8,%%xmm3,%%xmm3\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm4,%%xmm6\n"
- "movdqa %%xmm5,%%xmm7\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "punpcklwd %%xmm10,%%xmm8\n"
- "punpcklwd %%xmm11,%%xmm9\n"
- "movdqa %%xmm8,%%xmm10\n"
- "movdqa %%xmm9,%%xmm11\n"
- "palignr $0x8,%%xmm10,%%xmm10\n"
- "palignr $0x8,%%xmm11,%%xmm11\n"
- "punpcklwd %%xmm14,%%xmm12\n"
- "punpcklwd %%xmm15,%%xmm13\n"
- "movdqa %%xmm12,%%xmm14\n"
- "movdqa %%xmm13,%%xmm15\n"
- "palignr $0x8,%%xmm14,%%xmm14\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqa %%xmm0,%%xmm4\n"
- "palignr $0x8,%%xmm4,%%xmm4\n"
- "movq %%xmm4,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movdqa %%xmm2,%%xmm6\n"
- "movq %%xmm2,(%1)\n"
- "palignr $0x8,%%xmm6,%%xmm6\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movq %%xmm6,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm1,%%xmm5\n"
- "movq %%xmm1,(%1)\n"
- "palignr $0x8,%%xmm5,%%xmm5\n"
- "movq %%xmm5,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movq %%xmm3,(%1)\n"
- "movdqa %%xmm3,%%xmm7\n"
- "palignr $0x8,%%xmm7,%%xmm7\n"
- "movq %%xmm7,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm12,%%xmm8\n"
- "movq %%xmm8,(%1)\n"
- "movdqa %%xmm8,%%xmm12\n"
- "palignr $0x8,%%xmm12,%%xmm12\n"
- "movq %%xmm12,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm14,%%xmm10\n"
- "movdqa %%xmm10,%%xmm14\n"
- "movq %%xmm10,(%1)\n"
- "palignr $0x8,%%xmm14,%%xmm14\n"
- "punpckldq %%xmm13,%%xmm9\n"
- "movq %%xmm14,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "movdqa %%xmm9,%%xmm13\n"
- "movq %%xmm9,(%1)\n"
- "palignr $0x8,%%xmm13,%%xmm13\n"
- "movq %%xmm13,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "punpckldq %%xmm15,%%xmm11\n"
- "movq %%xmm11,(%1)\n"
- "movdqa %%xmm11,%%xmm15\n"
- "palignr $0x8,%%xmm15,%%xmm15\n"
- "movq %%xmm15,(%1,%4)\n"
- "lea (%1,%4,2),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
@@ -641,98 +637,99 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
- asm volatile(
-"1:"
+ asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%4),%%xmm1\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm0,%%xmm8\n"
- "punpcklbw %%xmm1,%%xmm0\n"
- "punpckhbw %%xmm1,%%xmm8\n"
- "movdqa %%xmm8,%%xmm1\n"
- "movdqa (%0),%%xmm2\n"
- "movdqa (%0,%4),%%xmm3\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm2,%%xmm8\n"
- "punpcklbw %%xmm3,%%xmm2\n"
- "punpckhbw %%xmm3,%%xmm8\n"
- "movdqa %%xmm8,%%xmm3\n"
- "movdqa (%0),%%xmm4\n"
- "movdqa (%0,%4),%%xmm5\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm4,%%xmm8\n"
- "punpcklbw %%xmm5,%%xmm4\n"
- "punpckhbw %%xmm5,%%xmm8\n"
- "movdqa %%xmm8,%%xmm5\n"
- "movdqa (%0),%%xmm6\n"
- "movdqa (%0,%4),%%xmm7\n"
- "lea (%0,%4,2),%0\n"
- "movdqa %%xmm6,%%xmm8\n"
- "punpcklbw %%xmm7,%%xmm6\n"
- "neg %4\n"
- "lea 0x10(%0,%4,8),%0\n"
- "punpckhbw %%xmm7,%%xmm8\n"
- "movdqa %%xmm8,%%xmm7\n"
- "neg %4\n"
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8\n"
- "movdqa %%xmm1,%%xmm9\n"
- "punpckhwd %%xmm2,%%xmm8\n"
- "punpckhwd %%xmm3,%%xmm9\n"
- "punpcklwd %%xmm2,%%xmm0\n"
- "punpcklwd %%xmm3,%%xmm1\n"
- "movdqa %%xmm8,%%xmm2\n"
- "movdqa %%xmm9,%%xmm3\n"
- "movdqa %%xmm4,%%xmm8\n"
- "movdqa %%xmm5,%%xmm9\n"
- "punpckhwd %%xmm6,%%xmm8\n"
- "punpckhwd %%xmm7,%%xmm9\n"
- "punpcklwd %%xmm6,%%xmm4\n"
- "punpcklwd %%xmm7,%%xmm5\n"
- "movdqa %%xmm8,%%xmm6\n"
- "movdqa %%xmm9,%%xmm7\n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8\n"
- "punpckldq %%xmm4,%%xmm0\n"
- "movlpd %%xmm0,(%1)\n" // Write back U channel
- "movhpd %%xmm0,(%2)\n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm2,%%xmm8\n"
- "punpckldq %%xmm6,%%xmm2\n"
- "movlpd %%xmm2,(%1)\n"
- "movhpd %%xmm2,(%2)\n"
- "punpckhdq %%xmm6,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm1,%%xmm8\n"
- "punpckldq %%xmm5,%%xmm1\n"
- "movlpd %%xmm1,(%1)\n"
- "movhpd %%xmm1,(%2)\n"
- "punpckhdq %%xmm5,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "movdqa %%xmm3,%%xmm8\n"
- "punpckldq %%xmm7,%%xmm3\n"
- "movlpd %%xmm3,(%1)\n"
- "movhpd %%xmm3,(%2)\n"
- "punpckhdq %%xmm7,%%xmm8\n"
- "movlpd %%xmm8,(%1,%5)\n"
- "lea (%1,%5,2),%1\n"
- "movhpd %%xmm8,(%2,%6)\n"
- "lea (%2,%6,2),%2\n"
- "sub $0x8,%3\n"
- "ja 1b\n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
@@ -740,7 +737,9 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
: "r"(static_cast<intptr_t>(src_stride)), // %4
"r"(static_cast<intptr_t>(dst_stride_a)), // %5
"r"(static_cast<intptr_t>(dst_stride_b)) // %6
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
);
}
#endif
@@ -748,9 +747,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
- int w) {
- int i;
- for (i = 0; i < w; ++i) {
+ int width) {
+ for (int i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -767,184 +765,143 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i, j;
- for (i = 0; i < width; ++i)
- for (j = 0; j < height; ++j)
+ for (int i = 0; i < width; ++i) {
+ for (int j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
}
+LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i = height;
- rotate_wx8_func TransposeWx8;
- rotate_wxh_func TransposeWxH;
-
+ void (*TransposeWx8)(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
- TransposeWxH = TransposeWxH_C;
- } else
-#endif
-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
- TransposeWx8 = TransposeWx8_FAST_SSSE3;
- TransposeWxH = TransposeWxH_C;
- } else
+ }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
- TransposeWxH = TransposeWxH_C;
- } else
+ }
#endif
- {
- TransposeWx8 = TransposeWx8_C;
- TransposeWxH = TransposeWxH_C;
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
+#endif
- // work across the source in 8x8 tiles
+ // Work across the source in 8x8 tiles
+ int i = height;
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
-
- src += 8 * src_stride; // go down 8 rows
- dst += 8; // move over 8 columns
- i -= 8;
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
}
- TransposeWxH(src, src_stride, dst, dst_stride, width, i);
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
+LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
- // from bottom to top. So set the source pointer to the end
+ // from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
-
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
+LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
- // from bottom to top. So set the destination pointer to the end
+ // from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
-
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
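
Annotation: the two functions above reduce a 90- or 270-degree rotation to TransposePlane with one scan direction reversed. A minimal scalar reference (illustrative only, not the library kernel, with hypothetical 3x2 test data in main) showing that reading the source bottom-up while transposing produces a clockwise 90-degree rotation:

#include <cstdint>
#include <cstdio>

// Illustrative reference: rotate a width x height byte plane 90 degrees
// clockwise by transposing a vertically flipped view of the source.
// dst is height x width; strides are in bytes.
void RotatePlane90_Reference(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Source row y becomes destination column (height - 1 - y).
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}

int main() {
  // 3x2 source:        90-degree clockwise result (2 wide, 3 tall):
  //   1 2 3              4 1
  //   4 5 6              5 2
  //                      6 3
  const uint8_t src[6] = {1, 2, 3, 4, 5, 6};
  uint8_t dst[6] = {0};
  RotatePlane90_Reference(src, 3, dst, 2, 3, 2);
  for (int y = 0; y < 3; ++y) printf("%d %d\n", dst[y * 2], dst[y * 2 + 1]);
  return 0;
}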
-static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
- int i;
- src += width - 1;
- for (i = 0; i < width; ++i) {
- dst[i] = src[0];
- --src;
- }
-}
-
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_REVERSE_LINE_SSSE3
-__declspec(naked)
-static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
-__asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // width
- movdqa xmm7, _kShuffleReverse
- lea eax, [eax + ecx - 16]
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm7
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja convertloop
- ret
- }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_REVERSE_LINE_SSSE3
-static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile(
- "movdqa (%3),%%xmm7\n"
- "lea -0x10(%0,%2,1),%0\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "r"(kShuffleReverse) // %3
- : "memory"
-);
-}
-#endif
-
+LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
- int i;
- reverse_func ReverseLine;
-
-#if defined(HAS_REVERSE_LINE_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
- ReverseLine = ReverseLine_NEON;
- } else
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_NEON;
+ }
#endif
-#if defined(HAS_REVERSE_LINE_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
- ReverseLine = ReverseLine_SSSE3;
- } else
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
#endif
- {
- ReverseLine = ReverseLine_C;
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
}
- // Rotate by 180 is a mirror and vertical flip
- src += src_stride * (height - 1);
-
- for (i = 0; i < height; ++i) {
- ReverseLine(src, dst, width);
- src -= src_stride;
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ if (width > kMaxStride) {
+ return;
+ }
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
}
}
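
Annotation: RotatePlane180 above combines a per-row mirror with a vertical flip, walking inward from both ends and using one temporary row so the first and last rows can trade places; an odd height simply re-mirrors the middle row. A compact scalar sketch of the same loop structure (the 1024-byte stack row is an assumption of this sketch; the real code uses SIMD_ALIGNED(row[kMaxStride]) and SIMD row kernels):

#include <cstdint>
#include <cstring>

// Scalar stand-in for the mirror row helper (illustrative only).
void MirrorRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
}

// Rotate a plane by 180 degrees: mirror each row and flip vertically.
// Mirrors the structure of RotatePlane180: the temporary row holds the
// mirrored top row so the top/bottom pair can be exchanged.
void RotatePlane180_Reference(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  uint8_t row[1024];  // sketch assumption: width <= 1024
  const uint8_t* src_bot = src + src_stride * (height - 1);
  uint8_t* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;  // odd height re-mirrors the middle row
  for (int y = 0; y < half_height; ++y) {
    MirrorRow_Ref(src, row, width);      // save mirrored top row
    src += src_stride;
    MirrorRow_Ref(src_bot, dst, width);  // mirrored bottom row -> top
    dst += dst_stride;
    memcpy(dst_bot, row, width);         // saved row -> bottom
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
}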
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
- int w) {
- int i;
- for (i = 0; i < w; ++i) {
+ int width) {
+ for (int i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
@@ -970,71 +927,55 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
- int w, int h) {
- int i, j;
- for (i = 0; i < w * 2; i += 2)
- for (j = 0; j < h; ++j) {
+ int width, int height) {
+ for (int i = 0; i < width * 2; i += 2)
+ for (int j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
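
Annotation: TransposeUVWxH_C both transposes and de-interleaves: byte i of source row j lands in row i/2 of dst_a (even bytes, U) or dst_b (odd bytes, V). A toy check of that index mapping with made-up values, using a block that is two UV pairs wide and three rows tall:

#include <cstdint>
#include <cstdio>

int main() {
  // Three rows of interleaved UV pairs: U0 V0 U1 V1 per row.
  const uint8_t src[12] = {10, 20, 11, 21,    // row 0: U=10,11  V=20,21
                           12, 22, 13, 23,    // row 1: U=12,13  V=22,23
                           14, 24, 15, 25};   // row 2: U=14,15  V=24,25
  const int width = 2, height = 3;            // width counted in UV pairs
  const int src_stride = 4, dst_stride = 3;
  uint8_t dst_u[6], dst_v[6];
  for (int i = 0; i < width * 2; i += 2) {
    for (int j = 0; j < height; ++j) {
      dst_u[j + (i >> 1) * dst_stride] = src[i + j * src_stride];
      dst_v[j + (i >> 1) * dst_stride] = src[i + j * src_stride + 1];
    }
  }
  // dst_u is now {10,12,14, 11,13,15}: U column n of the source became
  // U row n of the destination; dst_v holds the V samples likewise.
  printf("%d %d %d\n", dst_u[3], dst_u[4], dst_u[5]);  // prints 11 13 15
  return 0;
}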
+LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
- int i = height;
- rotate_uv_wx8_func TransposeWx8;
- rotate_uv_wxh_func TransposeWxH;
-
+ void (*TransposeUVWx8)(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
- unsigned long long store_reg[8];
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
- SaveRegisters_NEON(store_reg);
- TransposeWx8 = TransposeUVWx8_NEON;
- TransposeWxH = TransposeUVWxH_C;
- } else
-#endif
-#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (width % 8 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
- TransposeWx8 = TransposeUVWx8_SSE2;
- TransposeWxH = TransposeUVWxH_C;
- } else
-#endif
- {
- TransposeWx8 = TransposeUVWx8_C;
- TransposeWxH = TransposeUVWxH_C;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
}
+#endif
- // work through the source in 8x8 tiles
+ // Work through the source in 8x8 tiles.
+ int i = height;
while (i >= 8) {
- TransposeWx8(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width);
-
- src += 8 * src_stride; // go down 8 rows
- dst_a += 8; // move over 8 columns
- dst_b += 8; // move over 8 columns
- i -= 8;
+ TransposeUVWx8(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
}
- TransposeWxH(src, src_stride,
- dst_a, dst_stride_a,
- dst_b, dst_stride_b,
- width, i);
-
-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
- RestoreRegisters_NEON(store_reg);
- }
-#endif
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
}
+LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -1048,6 +989,7 @@ void RotateUV90(const uint8* src, int src_stride,
width, height);
}
+LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
@@ -1063,119 +1005,38 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
-#define HAS_REVERSE_LINE_UV_SSSE3
-__declspec(naked)
-void ReverseLineUV_SSSE3(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
-__asm {
- push edi
- mov eax, [esp + 4 + 4] // src
- mov edx, [esp + 4 + 8] // dst_a
- mov edi, [esp + 4 + 12] // dst_b
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm7, _kShuffleReverseUV
- lea eax, [eax + ecx * 2 - 16]
-
- convertloop :
- movdqa xmm0, [eax]
- lea eax, [eax - 16]
- pshufb xmm0, xmm7
- movlpd qword ptr [edx], xmm0
- lea edx, [edx + 8]
- movhpd qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 8
- ja convertloop
- pop edi
- ret
- }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_REVERSE_LINE_UV_SSSE3
-void ReverseLineUV_SSSE3(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
- intptr_t temp_width = static_cast<intptr_t>(width);
- asm volatile(
- "movdqa (%4),%%xmm7\n"
- "lea -0x10(%0,%3,2),%0\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
- "movlpd %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "movhpd %%xmm0,(%2)\n"
- "lea 0x8(%2),%2\n"
- "sub $0x8,%3\n"
- "ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(temp_width) // %3
- : "r"(kShuffleReverseUV) // %4
- : "memory"
-);
-}
-#endif
-
-static void ReverseLineUV_C(const uint8* src,
- uint8* dst_a, uint8* dst_b,
- int width) {
- int i;
- src += width << 1;
- for (i = 0; i < width; ++i) {
- src -= 2;
- dst_a[i] = src[0];
- dst_b[i] = src[1];
- }
-}
-
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
- int i;
- reverse_uv_func ReverseLine;
-
-#if defined(HAS_REVERSE_LINE_UV_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
- ReverseLine = ReverseLineUV_NEON;
- } else
-#endif
-#if defined(HAS_REVERSE_LINE_UV_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
- IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
- IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
- ReverseLine = ReverseLineUV_SSSE3;
- } else
-#endif
- {
- ReverseLine = ReverseLineUV_C;
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorRowUV_C;
+#if defined(HAS_MIRRORROW_UV_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRowUV = MirrorRowUV_NEON;
}
+#elif defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ MirrorRowUV = MirrorRowUV_SSSE3;
+ }
+#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
- for (i = 0; i < height; ++i) {
- ReverseLine(src, dst_a, dst_b, width);
-
- src += src_stride; // down one line at a time
- dst_a -= dst_stride_a; // nominally up one line at a time
- dst_b -= dst_stride_b; // nominally up one line at a time
+ for (int i = 0; i < height; ++i) {
+ MirrorRowUV(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
}
}
+LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1184,6 +1045,10 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height,
RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
@@ -1248,6 +1113,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
return -1;
}
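
Annotation: the new null-pointer and size checks make the entry point fail fast with -1. For context, a hedged usage sketch of I420Rotate; the helper name and the tightly packed buffer layout are assumptions of this example, not part of the library:

#include "libyuv/rotate.h"  // I420Rotate, kRotate90
#include <cstdint>
#include <vector>

// Rotate an I420 frame 90 degrees (sketch; assumes even width and height and
// tightly packed planes). A 90-degree rotation swaps width and height, so the
// destination strides are derived from the source height.
bool RotateI420By90(const uint8_t* src_y, const uint8_t* src_u,
                    const uint8_t* src_v, int width, int height,
                    std::vector<uint8_t>* dst) {
  const int dst_w = height;
  const int dst_h = width;
  dst->resize(dst_w * dst_h * 3 / 2);
  uint8_t* dst_y = dst->data();
  uint8_t* dst_u = dst_y + dst_w * dst_h;
  uint8_t* dst_v = dst_u + (dst_w / 2) * (dst_h / 2);
  return libyuv::I420Rotate(src_y, width, src_u, width / 2, src_v, width / 2,
                            dst_y, dst_w, dst_u, dst_w / 2, dst_v, dst_w / 2,
                            width, height, libyuv::kRotate90) == 0;
}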
+LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
@@ -1255,6 +1121,10 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height,
RotationMode mode) {
+ if (!src_y || !src_uv || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
@@ -1271,7 +1141,8 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
switch (mode) {
case kRotate0:
// copy frame
- return NV12ToI420(src_y, src_uv, src_stride_y,
+ return NV12ToI420(src_y, src_stride_y,
+ src_uv, src_stride_uv,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
@@ -1309,4 +1180,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
return -1;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
new file mode 100644
index 00000000..9c994467
--- /dev/null
+++ b/files/source/rotate_argb.cc
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
+ defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(height, 4) &&  // Height of src becomes width of dest.
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+
+ int src_pixel_step = src_stride / 4;
+ for (int i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
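
Annotation: ARGBTranspose reuses the ARGB scaler's row-down-even kernel as a strided gather: with the stride argument fixed at 0 and a step of src_stride/4 pixels, each call walks down one source column and writes it out as one destination row. A scalar model of that addressing (the _C variant is only declared above; this sketch illustrates the behavior and is not the library implementation):

#include <cstdint>

// Model of ScaleARGBRowDownEven as used by ARGBTranspose: copy dst_width
// ARGB pixels, taking every src_stepx-th pixel from src. With
// src_stepx == src_stride / 4, consecutive output pixels come from
// consecutive source rows, i.e. a source column becomes a destination row.
// Assumes 4-byte-aligned ARGB pixels.
void ScaleARGBRowDownEven_Model(const uint8_t* src_argb, int /*src_stride*/,
                                int src_stepx, uint8_t* dst_argb,
                                int dst_width) {
  const uint32_t* src = reinterpret_cast<const uint32_t*>(src_argb);
  uint32_t* dst = reinterpret_cast<uint32_t*>(dst_argb);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}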
+
+void ARGBRotate90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+ if (width * 4 > kMaxStride) {
+ return;
+ }
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
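
Annotation: as with the other public entry points, ARGBRotate treats a negative height as a request to read the source bottom-up, so a vertical flip can be expressed without a separate call. A hedged usage sketch (the helper name and packed stride are assumptions of this example):

#include "libyuv/rotate.h"       // RotationMode, kRotate0
#include "libyuv/rotate_argb.h"  // ARGBRotate
#include <cstdint>
#include <vector>

// Vertically flip an ARGB image by passing a negative height with kRotate0,
// which copies the inverted source. Assumes stride == width * 4.
bool FlipArgbVertically(const uint8_t* src, int width, int height,
                        std::vector<uint8_t>* dst) {
  dst->resize(static_cast<size_t>(width) * height * 4);
  return libyuv::ARGBRotate(src, width * 4, dst->data(), width * 4,
                            width, -height, libyuv::kRotate0) == 0;
}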
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
new file mode 100644
index 00000000..49b30032
--- /dev/null
+++ b/files/source/rotate_neon.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+static const uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld1.8 {d0}, [r9], %1 \n"
+ "vld1.8 {d1}, [r9], %1 \n"
+ "vld1.8 {d2}, [r9], %1 \n"
+ "vld1.8 {d3}, [r9], %1 \n"
+ "vld1.8 {d4}, [r9], %1 \n"
+ "vld1.8 {d5}, [r9], %1 \n"
+ "vld1.8 {d6}, [r9], %1 \n"
+ "vld1.8 {d7}, [r9] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d1}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d3}, [r9], %3 \n"
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d5}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d7}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9] \n"
+
+ "add %0, #8 \n" // src += 8
+ "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %4, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ "cmp %4, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.32 {d0[0]}, [r9], %1 \n"
+ "vld1.32 {d0[1]}, [r9], %1 \n"
+ "vld1.32 {d1[0]}, [r9], %1 \n"
+ "vld1.32 {d1[1]}, [r9], %1 \n"
+ "vld1.32 {d2[0]}, [r9], %1 \n"
+ "vld1.32 {d2[1]}, [r9], %1 \n"
+ "vld1.32 {d3[0]}, [r9], %1 \n"
+ "vld1.32 {d3[1]}, [r9] \n"
+
+ "mov r9, %2 \n"
+
+ "vld1.8 {q3}, [%5] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO: rework shuffle above to write
+ // out with 4 instead of 8 writes
+ "vst1.32 {d4[0]}, [r9], %3 \n"
+ "vst1.32 {d4[1]}, [r9], %3 \n"
+ "vst1.32 {d5[0]}, [r9], %3 \n"
+ "vst1.32 {d5[1]}, [r9] \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d0[0]}, [r9], %3 \n"
+ "vst1.32 {d0[1]}, [r9], %3 \n"
+ "vst1.32 {d1[0]}, [r9], %3 \n"
+ "vst1.32 {d1[1]}, [r9] \n"
+
+ "add %0, #4 \n" // src += 4
+ "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %4, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld1.16 {d0[0]}, [r9], %1 \n"
+ "vld1.16 {d1[0]}, [r9], %1 \n"
+ "vld1.16 {d0[1]}, [r9], %1 \n"
+ "vld1.16 {d1[1]}, [r9], %1 \n"
+ "vld1.16 {d0[2]}, [r9], %1 \n"
+ "vld1.16 {d1[2]}, [r9], %1 \n"
+ "vld1.16 {d0[3]}, [r9], %1 \n"
+ "vld1.16 {d1[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d1}, [r9] \n"
+
+ "add %0, #2 \n" // src += 2
+ "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %4, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%0], %1 \n"
+ "vld1.8 {d0[1]}, [%0], %1 \n"
+ "vld1.8 {d0[2]}, [%0], %1 \n"
+ "vld1.8 {d0[3]}, [%0], %1 \n"
+ "vld1.8 {d0[4]}, [%0], %1 \n"
+ "vld1.8 {d0[5]}, [%0], %1 \n"
+ "vld1.8 {d0[6]}, [%0], %1 \n"
+ "vld1.8 {d0[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_stride), // %3
+ "+r"(width) // %4
+ : "r"(&kVTbl4x4Transpose) // %5
+ : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+ );
+}
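
Annotation: the vtrn.8 / vtrn.16 / vtrn.32 sequence above is a butterfly transpose of the 8x8 tile; the exact register pairing and the vrev16 pass differ in detail, but the net effect is the same as swapping progressively larger off-diagonal sub-blocks. A scalar model of that idea, included only to make the data movement easier to follow:

#include <cstdint>
#include <utility>

// Butterfly (Eklundh-style) transpose of an 8x8 byte tile: swap the
// off-diagonal s x s sub-blocks for s = 1, 2, 4. This is the scalar
// equivalent of the vtrn.8 / vtrn.16 / vtrn.32 stages in TransposeWx8_NEON.
void Transpose8x8_Butterfly(uint8_t m[8][8]) {
  for (int s = 1; s < 8; s *= 2) {
    for (int i = 0; i < 8; i += 2 * s) {
      for (int j = 0; j < 8; j += 2 * s) {
        for (int di = 0; di < s; ++di) {
          for (int dj = 0; dj < s; ++dj) {
            std::swap(m[i + di][j + s + dj], m[i + s + di][j + dj]);
          }
        }
      }
    }
  }
}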
+
+static const uvec8 kVTbl4x4TransposeDi =
+ { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %6, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld2.8 {d0, d1}, [r9], %1 \n"
+ "vld2.8 {d2, d3}, [r9], %1 \n"
+ "vld2.8 {d4, d5}, [r9], %1 \n"
+ "vld2.8 {d6, d7}, [r9], %1 \n"
+ "vld2.8 {d16, d17}, [r9], %1 \n"
+ "vld2.8 {d18, d19}, [r9], %1 \n"
+ "vld2.8 {d20, d21}, [r9], %1 \n"
+ "vld2.8 {d22, d23}, [r9] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d18}, [r9], %3 \n"
+ "vst1.8 {d16}, [r9], %3 \n"
+ "vst1.8 {d22}, [r9], %3 \n"
+ "vst1.8 {d20}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.8 {d3}, [r9], %5 \n"
+ "vst1.8 {d1}, [r9], %5 \n"
+ "vst1.8 {d7}, [r9], %5 \n"
+ "vst1.8 {d5}, [r9], %5 \n"
+ "vst1.8 {d19}, [r9], %5 \n"
+ "vst1.8 {d17}, [r9], %5 \n"
+ "vst1.8 {d23}, [r9], %5 \n"
+ "vst1.8 {d21}, [r9] \n"
+
+ "add %0, #8*2 \n" // src += 8*2
+ "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %6, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %6, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ "cmp %6, #4 \n"
+ "blt 2f \n"
+
+    // TODO(frkoenig): clean this up
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.64 {d0}, [r9], %1 \n"
+ "vld1.64 {d1}, [r9], %1 \n"
+ "vld1.64 {d2}, [r9], %1 \n"
+ "vld1.64 {d3}, [r9], %1 \n"
+ "vld1.64 {d4}, [r9], %1 \n"
+ "vld1.64 {d5}, [r9], %1 \n"
+ "vld1.64 {d6}, [r9], %1 \n"
+ "vld1.64 {d7}, [r9] \n"
+
+ "vld1.8 {q15}, [%7] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.32 {d16[0]}, [r9], %3 \n"
+ "vst1.32 {d16[1]}, [r9], %3 \n"
+ "vst1.32 {d17[0]}, [r9], %3 \n"
+ "vst1.32 {d17[1]}, [r9], %3 \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d20[0]}, [r9], %3 \n"
+ "vst1.32 {d20[1]}, [r9], %3 \n"
+ "vst1.32 {d21[0]}, [r9], %3 \n"
+ "vst1.32 {d21[1]}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.32 {d18[0]}, [r9], %5 \n"
+ "vst1.32 {d18[1]}, [r9], %5 \n"
+ "vst1.32 {d19[0]}, [r9], %5 \n"
+ "vst1.32 {d19[1]}, [r9], %5 \n"
+
+ "add r9, %4, #4 \n"
+ "vst1.32 {d22[0]}, [r9], %5 \n"
+ "vst1.32 {d22[1]}, [r9], %5 \n"
+ "vst1.32 {d23[0]}, [r9], %5 \n"
+ "vst1.32 {d23[1]}, [r9] \n"
+
+ "add %0, #4*2 \n" // src += 4 * 2
+ "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %6, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
+ "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
+ "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
+ "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
+ "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
+ "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
+ "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
+ "vld2.16 {d1[3], d3[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d2}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.64 {d1}, [r9], %5 \n"
+ "vst1.64 {d3}, [r9] \n"
+
+ "add %0, #2*2 \n" // src += 2 * 2
+ "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %6, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
+ "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
+ "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
+ "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
+ "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
+ "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
+ "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
+ "vld2.8 {d0[7], d1[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+ "vst1.64 {d1}, [%4] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_stride_a), // %3
+ "+r"(dst_b), // %4
+ "+r"(dst_stride_b), // %5
+ "+r"(width) // %6
+ : "r"(&kVTbl4x4TransposeDi) // %7
+ : "memory", "cc", "r9",
+ "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
deleted file mode 100644
index 75ea957a..00000000
--- a/files/source/rotate_neon.s
+++ /dev/null
@@ -1,563 +0,0 @@
- .global RestoreRegisters_NEON
- .global ReverseLine_NEON
- .global ReverseLineUV_NEON
- .global SaveRegisters_NEON
- .global TransposeWx8_NEON
- .global TransposeUVWx8_NEON
- .type RestoreRegisters_NEON, function
- .type ReverseLine_NEON, function
- .type ReverseLineUV_NEON, function
- .type SaveRegisters_NEON, function
- .type TransposeWx8_NEON, function
- .type TransposeUVWx8_NEON, function
-
-@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
-@ r0 const uint8* src
-@ r1 uint8* dst
-@ r2 width
-ReverseLine_NEON:
-
- @ compute where to start writing destination
- add r1, r2 @ dst + width
-
- @ work on segments that are multiples of 16
- lsrs r3, r2, #4
-
- @ the output is written in two block. 8 bytes followed
- @ by another 8. reading is done sequentially, from left to
- @ right. writing is done from right to left in block sizes
- @ r1, the destination pointer is incremented after writing
- @ the first of the two blocks. need to subtract that 8 off
- @ along with 16 to get the next location.
- mov r3, #-24
-
- beq Lline_residuals
-
- @ back of destination by the size of the register that is
- @ going to be reversed
- sub r1, #16
-
- @ the loop needs to run on blocks of 16. what will be left
- @ over is either a negative number, the residuals that need
- @ to be done, or 0. if this isn't subtracted off here the
- @ loop will run one extra time.
- sub r2, #16
-
-Lsegments_of_16:
- vld1.8 {q0}, [r0]! @ src += 16
-
- @ reverse the bytes in the 64 bit segments. unable to reverse
- @ the bytes in the entire 128 bits in one go.
- vrev64.8 q0, q0
-
- @ because of the inability to reverse the entire 128 bits
- @ reverse the writing out of the two 64 bit segments.
- vst1.8 {d1}, [r1]!
- vst1.8 {d0}, [r1], r3 @ dst -= 16
-
- subs r2, #16
- bge Lsegments_of_16
-
- @ add 16 back to the counter. if the result is 0 there is no
- @ residuals so return
- adds r2, #16
- bxeq lr
-
- add r1, #16
-
-Lline_residuals:
-
- mov r3, #-3
-
- sub r1, #2
- subs r2, #2
- @ check for 16*n+1 scenarios where segments_of_2 should not
- @ be run, but there is something left over.
- blt Lsegment_of_1
-
-@ do this in neon registers as per
-@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-Lsegments_of_2:
- vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
-
- vst1.8 {d1[0]}, [r1]!
- vst1.8 {d0[0]}, [r1], r3 @ dst -= 2
-
- subs r2, #2
- bge Lsegments_of_2
-
- adds r2, #2
- bxeq lr
-
-Lsegment_of_1:
- add r1, #1
- vld1.8 {d0[0]}, [r0]
- vst1.8 {d0[0]}, [r1]
-
- bx lr
-
-@ void TransposeWx8_NEON (const uint8* src, int src_stride,
-@ uint8* dst, int dst_stride,
-@ int w)
-@ r0 const uint8* src
-@ r1 int src_stride
-@ r2 uint8* dst
-@ r3 int dst_stride
-@ stack int w
-TransposeWx8_NEON:
- push {r4,r8,r9,lr}
-
- ldr r8, [sp, #16] @ width
-
- @ loops are on blocks of 8. loop will stop when
- @ counter gets to or below 0. starting the counter
- @ at w-8 allow for this
- sub r8, #8
-
-@ handle 8x8 blocks. this should be the majority of the plane
-Lloop_8x8:
- mov r9, r0
-
- vld1.8 {d0}, [r9], r1
- vld1.8 {d1}, [r9], r1
- vld1.8 {d2}, [r9], r1
- vld1.8 {d3}, [r9], r1
- vld1.8 {d4}, [r9], r1
- vld1.8 {d5}, [r9], r1
- vld1.8 {d6}, [r9], r1
- vld1.8 {d7}, [r9]
-
- vtrn.8 d1, d0
- vtrn.8 d3, d2
- vtrn.8 d5, d4
- vtrn.8 d7, d6
-
- vtrn.16 d1, d3
- vtrn.16 d0, d2
- vtrn.16 d5, d7
- vtrn.16 d4, d6
-
- vtrn.32 d1, d5
- vtrn.32 d0, d4
- vtrn.32 d3, d7
- vtrn.32 d2, d6
-
- vrev16.8 q0, q0
- vrev16.8 q1, q1
- vrev16.8 q2, q2
- vrev16.8 q3, q3
-
- mov r9, r2
-
- vst1.8 {d1}, [r9], r3
- vst1.8 {d0}, [r9], r3
- vst1.8 {d3}, [r9], r3
- vst1.8 {d2}, [r9], r3
- vst1.8 {d5}, [r9], r3
- vst1.8 {d4}, [r9], r3
- vst1.8 {d7}, [r9], r3
- vst1.8 {d6}, [r9]
-
- add r0, #8 @ src += 8
- add r2, r3, lsl #3 @ dst += 8 * dst_stride
- subs r8, #8 @ w -= 8
- bge Lloop_8x8
-
- @ add 8 back to counter. if the result is 0 there are
- @ no residuals.
- adds r8, #8
- beq Ldone
-
- @ some residual, so between 1 and 7 lines left to transpose
- cmp r8, #2
- blt Lblock_1x8
-
- cmp r8, #4
- blt Lblock_2x8
-
-Lblock_4x8:
- mov r9, r0
- vld1.32 {d0[0]}, [r9], r1
- vld1.32 {d0[1]}, [r9], r1
- vld1.32 {d1[0]}, [r9], r1
- vld1.32 {d1[1]}, [r9], r1
- vld1.32 {d2[0]}, [r9], r1
- vld1.32 {d2[1]}, [r9], r1
- vld1.32 {d3[0]}, [r9], r1
- vld1.32 {d3[1]}, [r9]
-
- mov r9, r2
-
- adr r12, vtbl_4x4_transpose
- vld1.8 {q3}, [r12]
-
- vtbl.8 d4, {d0, d1}, d6
- vtbl.8 d5, {d0, d1}, d7
- vtbl.8 d0, {d2, d3}, d6
- vtbl.8 d1, {d2, d3}, d7
-
- @ TODO: rework shuffle above to write
- @ out with 4 instead of 8 writes
- vst1.32 {d4[0]}, [r9], r3
- vst1.32 {d4[1]}, [r9], r3
- vst1.32 {d5[0]}, [r9], r3
- vst1.32 {d5[1]}, [r9]
-
- add r9, r2, #4
- vst1.32 {d0[0]}, [r9], r3
- vst1.32 {d0[1]}, [r9], r3
- vst1.32 {d1[0]}, [r9], r3
- vst1.32 {d1[1]}, [r9]
-
- add r0, #4 @ src += 4
- add r2, r3, lsl #2 @ dst += 4 * dst_stride
- subs r8, #4 @ w -= 4
- beq Ldone
-
- @ some residual, check to see if it includes a 2x8 block,
- @ or less
- cmp r8, #2
- blt Lblock_1x8
-
-Lblock_2x8:
- mov r9, r0
- vld1.16 {d0[0]}, [r9], r1
- vld1.16 {d1[0]}, [r9], r1
- vld1.16 {d0[1]}, [r9], r1
- vld1.16 {d1[1]}, [r9], r1
- vld1.16 {d0[2]}, [r9], r1
- vld1.16 {d1[2]}, [r9], r1
- vld1.16 {d0[3]}, [r9], r1
- vld1.16 {d1[3]}, [r9]
-
- vtrn.8 d0, d1
-
- mov r9, r2
-
- vst1.64 {d0}, [r9], r3
- vst1.64 {d1}, [r9]
-
- add r0, #2 @ src += 2
- add r2, r3, lsl #1 @ dst += 2 * dst_stride
- subs r8, #2 @ w -= 2
- beq Ldone
-
-Lblock_1x8:
- vld1.8 {d0[0]}, [r0], r1
- vld1.8 {d0[1]}, [r0], r1
- vld1.8 {d0[2]}, [r0], r1
- vld1.8 {d0[3]}, [r0], r1
- vld1.8 {d0[4]}, [r0], r1
- vld1.8 {d0[5]}, [r0], r1
- vld1.8 {d0[6]}, [r0], r1
- vld1.8 {d0[7]}, [r0]
-
- vst1.64 {d0}, [r2]
-
-Ldone:
-
- pop {r4,r8,r9,pc}
-
-vtbl_4x4_transpose:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
-@ void SaveRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-SaveRegisters_NEON:
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
-@ void RestoreRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-RestoreRegisters_NEON:
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
-@ void ReverseLineUV_NEON (const uint8* src,
-@ uint8* dst_a,
-@ uint8* dst_b,
-@ int width)
-@ r0 const uint8* src
-@ r1 uint8* dst_a
-@ r2 uint8* dst_b
-@ r3 width
-ReverseLineUV_NEON:
-
- @ compute where to start writing destination
- add r1, r1, r3 @ dst_a + width
- add r2, r2, r3 @ dst_b + width
-
- @ work on input segments that are multiples of 16, but
- @ width that has been passed is output segments, half
- @ the size of input.
- lsrs r12, r3, #3
-
- beq Lline_residuals_di
-
- @ the output is written in to two blocks.
- mov r12, #-8
-
- @ back of destination by the size of the register that is
- @ going to be reversed
- sub r1, r1, #8
- sub r2, r2, #8
-
- @ the loop needs to run on blocks of 8. what will be left
- @ over is either a negative number, the residuals that need
- @ to be done, or 0. if this isn't subtracted off here the
- @ loop will run one extra time.
- sub r3, r3, #8
-
-Lsegments_of_8_di:
- vld2.8 {d0, d1}, [r0]! @ src += 16
-
- @ reverse the bytes in the 64 bit segments
- vrev64.8 q0, q0
-
- vst1.8 {d0}, [r1], r12 @ dst_a -= 8
- vst1.8 {d1}, [r2], r12 @ dst_b -= 8
-
- subs r3, r3, #8
- bge Lsegments_of_8_di
-
- @ add 8 back to the counter. if the result is 0 there is no
- @ residuals so return
- adds r3, r3, #8
- bxeq lr
-
- add r1, r1, #8
- add r2, r2, #8
-
-Lline_residuals_di:
-
- mov r12, #-1
-
- sub r1, r1, #1
- sub r2, r2, #1
-
-@ do this in neon registers as per
-@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-Lsegments_of_1:
- vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
-
- vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1
- vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1
-
- subs r3, r3, #1
- bgt Lsegments_of_1
-
- bx lr
-
-@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
-@ uint8* dst_a, int dst_stride_a,
-@ uint8* dst_b, int dst_stride_b,
-@ int width)
-@ r0 const uint8* src
-@ r1 int src_stride
-@ r2 uint8* dst_a
-@ r3 int dst_stride_a
-@ stack uint8* dst_b
-@ stack int dst_stride_b
-@ stack int width
-TransposeUVWx8_NEON:
- push {r4-r9,lr}
-
- ldr r4, [sp, #28] @ dst_b
- ldr r5, [sp, #32] @ dst_stride_b
- ldr r8, [sp, #36] @ width
- @ loops are on blocks of 8. loop will stop when
- @ counter gets to or below 0. starting the counter
- @ at w-8 allow for this
- sub r8, #8
-
-@ handle 8x8 blocks. this should be the majority of the plane
-Lloop_8x8_di:
- mov r9, r0
-
- vld2.8 {d0, d1}, [r9], r1
- vld2.8 {d2, d3}, [r9], r1
- vld2.8 {d4, d5}, [r9], r1
- vld2.8 {d6, d7}, [r9], r1
- vld2.8 {d8, d9}, [r9], r1
- vld2.8 {d10, d11}, [r9], r1
- vld2.8 {d12, d13}, [r9], r1
- vld2.8 {d14, d15}, [r9]
-
- vtrn.8 q1, q0
- vtrn.8 q3, q2
- vtrn.8 q5, q4
- vtrn.8 q7, q6
-
- vtrn.16 q1, q3
- vtrn.16 q0, q2
- vtrn.16 q5, q7
- vtrn.16 q4, q6
-
- vtrn.32 q1, q5
- vtrn.32 q0, q4
- vtrn.32 q3, q7
- vtrn.32 q2, q6
-
- vrev16.8 q0, q0
- vrev16.8 q1, q1
- vrev16.8 q2, q2
- vrev16.8 q3, q3
- vrev16.8 q4, q4
- vrev16.8 q5, q5
- vrev16.8 q6, q6
- vrev16.8 q7, q7
-
- mov r9, r2
-
- vst1.8 {d2}, [r9], r3
- vst1.8 {d0}, [r9], r3
- vst1.8 {d6}, [r9], r3
- vst1.8 {d4}, [r9], r3
- vst1.8 {d10}, [r9], r3
- vst1.8 {d8}, [r9], r3
- vst1.8 {d14}, [r9], r3
- vst1.8 {d12}, [r9]
-
- mov r9, r4
-
- vst1.8 {d3}, [r9], r5
- vst1.8 {d1}, [r9], r5
- vst1.8 {d7}, [r9], r5
- vst1.8 {d5}, [r9], r5
- vst1.8 {d11}, [r9], r5
- vst1.8 {d9}, [r9], r5
- vst1.8 {d15}, [r9], r5
- vst1.8 {d13}, [r9]
-
- add r0, #8*2 @ src += 8*2
- add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a
- add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b
- subs r8, #8 @ w -= 8
- bge Lloop_8x8_di
-
- @ add 8 back to counter. if the result is 0 there are
- @ no residuals.
- adds r8, #8
- beq Ldone_di
-
- @ some residual, so between 1 and 7 lines left to transpose
- cmp r8, #2
- blt Lblock_1x8_di
-
- cmp r8, #4
- blt Lblock_2x8_di
-
-@ TODO(frkoenig) : clean this up
-Lblock_4x8_di:
- mov r9, r0
- vld1.64 {d0}, [r9], r1
- vld1.64 {d1}, [r9], r1
- vld1.64 {d2}, [r9], r1
- vld1.64 {d3}, [r9], r1
- vld1.64 {d4}, [r9], r1
- vld1.64 {d5}, [r9], r1
- vld1.64 {d6}, [r9], r1
- vld1.64 {d7}, [r9]
-
- adr r12, vtbl_4x4_transpose_di
- vld1.8 {q7}, [r12]
-
- vtrn.8 q0, q1
- vtrn.8 q2, q3
-
- vtbl.8 d8, {d0, d1}, d14
- vtbl.8 d9, {d0, d1}, d15
- vtbl.8 d10, {d2, d3}, d14
- vtbl.8 d11, {d2, d3}, d15
- vtbl.8 d12, {d4, d5}, d14
- vtbl.8 d13, {d4, d5}, d15
- vtbl.8 d0, {d6, d7}, d14
- vtbl.8 d1, {d6, d7}, d15
-
- mov r9, r2
-
- vst1.32 {d8[0]}, [r9], r3
- vst1.32 {d8[1]}, [r9], r3
- vst1.32 {d9[0]}, [r9], r3
- vst1.32 {d9[1]}, [r9], r3
-
- add r9, r2, #4
- vst1.32 {d12[0]}, [r9], r3
- vst1.32 {d12[1]}, [r9], r3
- vst1.32 {d13[0]}, [r9], r3
- vst1.32 {d13[1]}, [r9]
-
- mov r9, r4
-
- vst1.32 {d10[0]}, [r9], r5
- vst1.32 {d10[1]}, [r9], r5
- vst1.32 {d11[0]}, [r9], r5
- vst1.32 {d11[1]}, [r9], r5
-
- add r9, r4, #4
- vst1.32 {d0[0]}, [r9], r5
- vst1.32 {d0[1]}, [r9], r5
- vst1.32 {d1[0]}, [r9], r5
- vst1.32 {d1[1]}, [r9]
-
- add r0, #4*2 @ src += 4 * 2
- add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
- add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b
- subs r8, #4 @ w -= 4
- beq Ldone_di
-
- @ some residual, check to see if it includes a 2x8 block,
- @ or less
- cmp r8, #2
- blt Lblock_1x8_di
-
-Lblock_2x8_di:
- mov r9, r0
- vld2.16 {d0[0], d2[0]}, [r9], r1
- vld2.16 {d1[0], d3[0]}, [r9], r1
- vld2.16 {d0[1], d2[1]}, [r9], r1
- vld2.16 {d1[1], d3[1]}, [r9], r1
- vld2.16 {d0[2], d2[2]}, [r9], r1
- vld2.16 {d1[2], d3[2]}, [r9], r1
- vld2.16 {d0[3], d2[3]}, [r9], r1
- vld2.16 {d1[3], d3[3]}, [r9]
-
- vtrn.8 d0, d1
- vtrn.8 d2, d3
-
- mov r9, r2
-
- vst1.64 {d0}, [r9], r3
- vst1.64 {d2}, [r9]
-
- mov r9, r4
-
- vst1.64 {d1}, [r9], r5
- vst1.64 {d3}, [r9]
-
- add r0, #2*2 @ src += 2 * 2
- add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a
- add r4, r5, lsl #1 @ dst_a += 2 * dst_stride_a
- subs r8, #2 @ w -= 2
- beq Ldone_di
-
-Lblock_1x8_di:
- vld2.8 {d0[0], d1[0]}, [r0], r1
- vld2.8 {d0[1], d1[1]}, [r0], r1
- vld2.8 {d0[2], d1[2]}, [r0], r1
- vld2.8 {d0[3], d1[3]}, [r0], r1
- vld2.8 {d0[4], d1[4]}, [r0], r1
- vld2.8 {d0[5], d1[5]}, [r0], r1
- vld2.8 {d0[6], d1[6]}, [r0], r1
- vld2.8 {d0[7], d1[7]}, [r0]
-
- vst1.64 {d0}, [r2]
- vst1.64 {d1}, [r4]
-
-Ldone_di:
- pop {r4-r9, pc}
-
-vtbl_4x4_transpose_di:
- .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
diff --git a/files/source/rotate_priv.h b/files/source/rotate_priv.h
deleted file mode 100644
index b4df1494..00000000
--- a/files/source/rotate_priv.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef SOURCE_ROTATE_PRIV_H_
-#define SOURCE_ROTATE_PRIV_H_
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-// Rotate planes by 90, 180, 270
-void
-RotatePlane90(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotatePlane180(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotatePlane270(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-RotateUV90(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-// Rotations for when U and V are interleaved.
-// These functions take one input pointer and
-// split the data into two buffers while
-// rotating them.
-void
-RotateUV180(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-void
-RotateUV270(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-// The 90 and 270 functions are based on transposes.
-// Doing a transpose with reversing the read/write
-// order will result in a rotation by +- 90 degrees.
-void
-TransposePlane(const uint8* src, int src_stride,
- uint8* dst, int dst_stride,
- int width, int height);
-
-void
-TransposeUV(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width, int height);
-
-} // namespace libyuv
-
-#endif // SOURCE_ROTATE_PRIV_H_
diff --git a/files/source/row.h b/files/source/row.h
deleted file mode 100644
index 85343c56..00000000
--- a/files/source/row.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_ROW_H_
-#define LIBYUV_SOURCE_ROW_H_
-
-#include "libyuv/basic_types.h"
-
-// The following are available on all x86 platforms
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BG24TOARGBROW_SSSE3
-#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RGB24TOYROW_SSSE3
-#define HAS_RAWTOYROW_SSSE3
-#define HAS_RGB24TOUVROW_SSSE3
-#define HAS_RAWTOUVROW_SSSE3
-#endif
-
-// The following are available only on Windows
-#if defined(WIN32) \
- && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define HAS_BGRATOYROW_SSSE3
-#define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_ABGRTOUVROW_SSSE3
-#endif
-
-extern "C" {
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-#endif
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#endif
-
-#ifdef OSX
-extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-#else
-extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
-#endif
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
-
-// Method to force C version.
-//#define USE_MMX 0
-//#define USE_SSE2 0
-
-#if !defined(USE_MMX)
-// Windows, Mac and Linux use MMX
-#if defined(__i386__) || defined(_MSC_VER)
-#define USE_MMX 1
-#else
-#define USE_MMX 0
-#endif
-#endif
-
-#if !defined(USE_SSE2)
-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
-#define USE_SSE2 1
-#else
-#define USE_SSE2 0
-#endif
-#endif
-
-// x64 uses MMX2 (SSE) so emms is not required.
-// Warning C4799: function has no EMMS instruction.
-// EMMS() is slow and should be called by the calling function once per image.
-#if USE_MMX && !defined(ARCH_CPU_X86_64)
-#if defined(_MSC_VER)
-#define EMMS() __asm emms
-#pragma warning(disable: 4799)
-#else
-#define EMMS() asm("emms")
-#endif
-#else
-#define EMMS()
-#endif
-
-
-} // extern "C"
-
-#endif // LIBYUV_SOURCE_ROW_H_
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
new file mode 100644
index 00000000..c5f3ce05
--- /dev/null
+++ b/files/source/row_common.cc
@@ -0,0 +1,1246 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 a = src_bgra[0];
+ uint8 r = src_bgra[1];
+ uint8 g = src_bgra[2];
+ uint8 b = src_bgra[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_bgra += 4;
+ }
+}
+
+void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 r = src_abgr[0];
+ uint8 g = src_abgr[1];
+ uint8 b = src_abgr[2];
+ uint8 a = src_abgr[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_abgr += 4;
+ }
+}
+
+void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ // To support in-place conversion.
+ uint8 a = src_abgr[0];
+ uint8 b = src_abgr[1];
+ uint8 g = src_abgr[2];
+ uint8 r = src_abgr[3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ dst_argb += 4;
+ src_abgr += 4;
+ }
+}
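
Annotation: the "to support in-place conversion" note above is the key property of these swizzle rows: every byte of a pixel is read into a local before any destination byte is written, so the same pointer can be passed for src and dst. A minimal self-contained illustration of that read-all-then-write pattern (not library code):

#include <cassert>
#include <cstdint>

// One-pixel version of the pattern used by the row functions above: read the
// whole pixel first, then write, so src and dst may be the same memory.
void SwizzlePixelInPlace(uint8_t* px) {
  uint8_t a = px[0], b = px[1], g = px[2], r = px[3];  // read everything first
  px[0] = b; px[1] = g; px[2] = r; px[3] = a;          // then overwrite
}

int main() {
  uint8_t px[4] = {10, 20, 30, 40};
  SwizzlePixelInPlace(px);
  assert(px[0] == 20 && px[1] == 30 && px[2] == 40 && px[3] == 10);
  return 0;
}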
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb24 += 3;
+ }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x1f;
+ uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
+ uint8 r = src_rgb[1] >> 3;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 2) | (g >> 4);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x1f;
+ uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
+ uint8 r = (src_rgb[1] & 0x7c) >> 2;
+ uint8 a = src_rgb[1] >> 7;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 3) | (g >> 2);
+ dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;  // a is 0 or 1; negation maps it to 0 or 255.
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb[0] & 0x0f;
+ uint8 g = src_rgb[0] >> 4;
+ uint8 r = src_rgb[1] & 0x0f;
+ uint8 a = src_rgb[1] >> 4;
+ dst_argb[0] = (b << 4) | b;
+ dst_argb[1] = (g << 4) | g;
+ dst_argb[2] = (r << 4) | r;
+ dst_argb[3] = (a << 4) | a;
+ dst_argb += 4;
+ src_rgb += 2;
+ }
+}
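
Annotation: the (x << 3) | (x >> 2) and (x << 4) | x expressions above expand 5- and 4-bit channels to 8 bits by bit replication, which hits 0 and 255 exactly at the endpoints and spreads the values evenly in between (green in RGB565 uses the 6-bit form (g << 2) | (g >> 4)). A quick check of the pattern, with values chosen only for illustration:

#include <cassert>
#include <cstdint>

// Bit replication used by the RGB565/ARGB1555/ARGB4444 expanders above.
inline uint8_t Expand5(uint8_t v) { return (v << 3) | (v >> 2); }  // 5 -> 8 bits
inline uint8_t Expand4(uint8_t v) { return (v << 4) | v; }         // 4 -> 8 bits

int main() {
  assert(Expand5(0x00) == 0x00 && Expand5(0x1f) == 0xff);  // endpoints map exactly
  assert(Expand5(0x10) == 0x84);                           // 16/31 ~ 132/255
  assert(Expand4(0x0f) == 0xff && Expand4(0x08) == 0x88);
  return 0;
}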
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ uint8 a = src_argb[3];
+ dst_rgb[0] = a;
+ dst_rgb[1] = b;
+ dst_rgb[2] = g;
+ dst_rgb[3] = r;
+ dst_rgb += 4;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = b;
+ dst_rgb[1] = g;
+ dst_rgb[2] = r;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = r;
+ dst_rgb[1] = g;
+ dst_rgb[2] = b;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+// TODO(fbarchard): support big endian CPU
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 2;
+ uint8 r1 = src_argb[6] >> 3;
+ *reinterpret_cast<uint32*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
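
Annotation: the loop above packs two 16-bit RGB565 pixels into a single 32-bit store (with an odd-width tail), which is what the "support big endian CPU" TODO refers to: the low 16 bits must land on the first pixel in memory. An endian-neutral formulation would store each pixel byte by byte; sketched below for comparison only, this is not the library code:

#include <cstdint>

// Endian-neutral variant of the ARGBToRGB565Row_C inner loop: one explicit
// little-endian 16-bit store per pixel instead of one 32-bit store per pair.
void ArgbToRgb565Row_Portable(const uint8_t* src_argb, uint8_t* dst_rgb,
                              int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t b = src_argb[0] >> 3;
    uint16_t g = src_argb[1] >> 2;
    uint16_t r = src_argb[2] >> 3;
    uint16_t pixel = static_cast<uint16_t>(b | (g << 5) | (r << 11));
    dst_rgb[0] = static_cast<uint8_t>(pixel & 0xff);  // low byte first
    dst_rgb[1] = static_cast<uint8_t>(pixel >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}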
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 3;
+ uint8 r1 = src_argb[6] >> 3;
+ uint8 a1 = src_argb[7] >> 7;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ uint8 b1 = src_argb[4] >> 4;
+ uint8 g1 = src_argb[5] >> 4;
+ uint8 r1 = src_argb[6] >> 4;
+ uint8 a1 = src_argb[7] >> 4;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
+}
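+
+// The helpers above implement the BT.601 studio-swing conversion in 8 bit
+// fixed point, e.g. RGBToY(255, 255, 255) =
+// (((66 + 129 + 25) * 255 + 128) >> 8) + 16 = 235 and RGBToY(0, 0, 0) = 16,
+// so Y lands in the [16, 235] video range.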
+
+#define MAKEROWY(NAME, R, G, B) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ for (int x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += 4; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ for (int x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
+ src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
+ src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
+ src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += 8; \
+ src_rgb1 += 8; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+MAKEROWY(ARGB, 2, 1, 0)
+MAKEROWY(BGRA, 1, 2, 3)
+MAKEROWY(ABGR, 0, 1, 2)
+MAKEROWY(RGBA, 3, 2, 1)
+
+// http://en.wikipedia.org/wiki/Grayscale.
+// 0.11 * B + 0.59 * G + 0.30 * R
+// Coefficients rounded to multiple of 2 for consistency with SSSE3 version.
+static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) {
+ return (( 76 * r + 152 * g + 28 * b) >> 8);
+}
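+
+// The coefficients sum to 256, so RGBToGray(255, 255, 255) = (256 * 255) >> 8
+// = 255, and the weights correspond to roughly 0.30 R, 0.59 G, 0.11 B.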
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]);
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = src_argb[3];
+ dst_argb += 4;
+ src_argb += 4;
+ }
+}
+
+// Convert a row of image to Sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int sb = (b * 17 + g * 68 + r * 35) >> 7;
+ int sg = (b * 22 + g * 88 + r * 45) >> 7;
+ int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not overflow; a is preserved from the original.
+ if (sg > 255) {
+ sg = 255;
+ }
+ if (sr > 255) {
+ sr = 255;
+ }
+ dst_argb[0] = sb;
+ dst_argb[1] = sg;
+ dst_argb[2] = sr;
+ dst_argb += 4;
+ }
+}
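+
+// For white input the sepia weights give sb = (120 * 255) >> 7 = 239,
+// sg = (155 * 255) >> 7 = 308 and sr = (172 * 255) >> 7 = 342, so only green
+// and red can exceed 255; the blue weights sum to 120 < 128, which is why sb
+// needs no clamp.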
+
+// Apply color matrix to a row of image. Matrix is signed.
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+ if (sb < 0) {
+ sb = 0;
+ }
+ if (sb > 255) {
+ sb = 255;
+ }
+ if (sg < 0) {
+ sg = 0;
+ }
+ if (sg > 255) {
+ sg = 255;
+ }
+ if (sr < 0) {
+ sr = 0;
+ }
+ if (sr > 255) {
+ sr = 255;
+ }
+ dst_argb[0] = sb;
+ dst_argb[1] = sg;
+ dst_argb[2] = sr;
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb[3] = table_argb[a * 4 + 3];
+ dst_argb += 4;
+ }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+ dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+ dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb += 4;
+ }
+}
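+
+// For example, with interval_size = 8 and a scale of 65536 / 8 = 8192 (the
+// 16.16 reciprocal of the interval), a channel of 130 maps to
+// (130 * 8192) >> 16 = 16, then 16 * 8 + interval_offset, snapping every
+// value in [128, 135] to the same level.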
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+ // Copy a Y to RGB.
+ for (int x = 0; x < width; ++x) {
+ uint8 y = src_y[0];
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ ++src_y;
+ }
+}
+
+// C reference code that mimics the YUV assembly.
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129; clamped to 127 to fit int8 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
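+
+// For reference: with Y = 235 and U = V = 128, YuvPixel below computes
+// y1 = (235 - 16) * 74 = 16206, and in every channel the U/V terms cancel
+// against the bias, leaving Clip(16206 >> 6) = 253; studio white therefore
+// decodes to about (253, 253, 253), and Y = 16 with U = V = 128 gives black.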
+
+static __inline uint32 Clip(int32 val) {
+ if (val < 0) {
+ return static_cast<uint32>(0);
+ } else if (val > 255) {
+ return static_cast<uint32>(255);
+ }
+ return static_cast<uint32>(val);
+}
+
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
+ int ashift, int rshift, int gshift, int bshift) {
+ int32 y1 = (static_cast<int32>(y) - 16) * YG;
+ uint32 b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+ uint32 g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+ uint32 r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+ *reinterpret_cast<uint32*>(rgb_buf) = (b << bshift) |
+ (g << gshift) |
+ (r << rshift) |
+ (255u << ashift);
+}
+
+static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ int32 y1 = (static_cast<int32>(y) - 16) * YG;
+ *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+ *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+ *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+}
+
+void I444ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width; ++x) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
+ y_buf += 1;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// Also used for 420
+void I422ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+
+void I422ToRAWRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+ rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ }
+}
+
+void I411ToARGBRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 3; x += 4) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
+ YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
+ y_buf += 4;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 16; // Advance 4 pixels.
+ }
+ if (width & 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void NV12ToARGBRow_C(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ uv_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void NV21ToARGBRow_C(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
+ y_buf += 2;
+ vu_buf += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+ }
+}
+
+void I422ToBGRARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+ }
+}
+
+void I422ToABGRRow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+ }
+}
+
+void I422ToRGBARow_C(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+ YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
+ y_buf += 2;
+ u_buf += 1;
+ v_buf += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+ }
+}
+
+void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
+ for (int x = 0; x < width; ++x) {
+ YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
+ y_buf += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ src += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ src_uv += (width - 1) << 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[-2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[-2 + 1];
+ src_uv -= 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+ const uint32* src32 = reinterpret_cast<const uint32*>(src);
+ uint32* dst32 = reinterpret_cast<uint32*>(dst);
+ src32 += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst32[x] = src32[0];
+ dst32[x + 1] = src32[-1];
+ src32 -= 2;
+ }
+ if (width & 1) {
+ dst32[width - 1] = src32[0];
+ }
+}
+
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[3];
+ src_uv += 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+ memcpy(dst, src, count);
+}
+
+void SetRow8_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+ // VC will generate rep stosb.
+ for (int x = 0; x < count; ++x) {
+ dst[x] = v8;
+ }
+#else
+ memset(dst, v8, count);
+#endif
+}
+
+void SetRows32_C(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ for (int x = 0; x < width; ++x) {
+ d[x] = v32;
+ }
+ dst += dst_stride;
+ }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_yuy2[1];
+ dst_v[0] = src_yuy2[3];
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_yuy2[0];
+ dst_y[x + 1] = src_yuy2[2];
+ src_yuy2 += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_yuy2[0];
+ }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+ dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_uyvy[0];
+ dst_v[0] = src_uyvy[2];
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_uyvy[1];
+ dst_y[x + 1] = src_uyvy[3];
+ src_uyvy += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_uyvy[1];
+ }
+}
+
+#define BLEND(f, b, a) ((((256 - (a)) * (b)) >> 8) + (f))
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
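+// src_argb0 is assumed to be premultiplied (attenuated) by its alpha, so
+// f + b * (256 - a) / 256 stays within a byte; e.g. with a = 192, a
+// premultiplied foreground of 150 over a background of 64 gives
+// ((64 * 64) >> 8) + 150 = 166.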
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+
+ fb = src_argb0[4 + 0];
+ fg = src_argb0[4 + 1];
+ fr = src_argb0[4 + 2];
+ a = src_argb0[4 + 3];
+ bb = src_argb1[4 + 0];
+ bg = src_argb1[4 + 1];
+ br = src_argb1[4 + 2];
+ dst_argb[4 + 0] = BLEND(fb, bb, a);
+ dst_argb[4 + 1] = BLEND(fg, bg, a);
+ dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ src_argb0 += 8;
+ src_argb1 += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+ }
+}
+#undef BLEND
+#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
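+// ATTENUATE approximates (f * a) / 255 by replicating both operands into
+// 16 bits, e.g. f = 255, a = 128: (0x8080 * 0xffff) >> 24 = 128.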
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width - 1; i += 2) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ b = src_argb[4];
+ g = src_argb[5];
+ r = src_argb[6];
+ a = src_argb[7];
+ dst_argb[4] = ATTENUATE(b, a);
+ dst_argb[5] = ATTENUATE(g, a);
+ dst_argb[6] = ATTENUATE(r, a);
+ dst_argb[7] = a;
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 b = src_argb[0];
+ const uint32 g = src_argb[1];
+ const uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.16 fixed point inverse table
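+// For example, a = 128 and a premultiplied b = 64 give ia = 65536 / 128 = 512
+// and (64 * 512) >> 8 = 128, where the exact answer is 64 * 255 / 128 = 127.5.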
+#define T(a) 0x10000 / a
+uint32 fixed_invtbl8[256] = {
+ 0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+ T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+ T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+ T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+ T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+ T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+ T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+ T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+ T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+ T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+ T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+ T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+ T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+ T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+ T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+ T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+ T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+ T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+ T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+ T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+ T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+ T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+ T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+ T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+ T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+ T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+ T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+ T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x0100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ if (a) {
+ const uint32 ia = fixed_invtbl8[a]; // 8.16 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ if (b > 255) {
+ b = 255;
+ }
+ if (g > 255) {
+ g = 255;
+ }
+ if (r > 255) {
+ r = 255;
+ }
+ }
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+// Wrappers to handle odd width
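+// n = width & ~7 is the largest multiple of 8 pixels; the SIMD row handles
+// those and the C row finishes the 0 to 7 pixel tail. UV_SHIFT converts the
+// pixel offset into a chroma offset: 1 for 4:2:2, 2 for 4:1:1, 0 for 4:4:4
+// and for the interleaved NV12/NV21 UV plane.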
+#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* u_buf, \
+ const uint8* v_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~7; \
+ I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \
+ I420TORGB_C(y_buf + n, \
+ u_buf + (n >> UV_SHIFT), \
+ v_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * 4, width & 7); \
+ }
+
+// Wrappers to handle odd width
+#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* uv_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~7; \
+ NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \
+ NV12TORGB_C(y_buf + n, \
+ uv_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * 4, width & 7); \
+ }
+
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
+Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
+#endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
+ I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
+#endif
+#ifdef HAS_I422TORGBAROW_SSSE3
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
+YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
+Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
+#endif
+#undef YANY
+
+#define RGBANY(NAMEANY, ARGBTORGB, BPP) \
+ void NAMEANY(const uint8* argb_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ SIMD_ALIGNED(uint8 row[kMaxStride]); \
+ ARGBTORGB(argb_buf, row, width); \
+ memcpy(rgb_buf, row, width * BPP); \
+ }
+
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3)
+RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3)
+RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
+RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
+RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3)
+#endif
+#undef RGBANY
+
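+// Unlike the wrappers above, this variant does not fall back to C for the
+// tail: it runs the SIMD row over the first width - 16 pixels and then once
+// more over the final 16, overlapping whatever the first pass already wrote,
+// so it effectively requires width >= 16.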
+#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ ARGBTOY_SSE(src_argb, dst_y, width - 16); \
+ ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \
+ }
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
+#endif
+#ifdef HAS_RGBATOYROW_SSSE3
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
+#endif
+#ifdef HAS_YUY2TOYROW_SSE2
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
+#endif
+#undef YANY
+
+#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
+ void NAMEANY(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~15; \
+ ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & 15); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
+#endif
+#ifdef HAS_RGBATOYROW_SSSE3
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
+#endif
+#undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
+ void NAMEANY(const uint8* src_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~15; \
+ ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & 15); \
+ }
+
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
+ YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
+ UYVYToUV422Row_C, 2)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \
+ YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \
+ UYVYToUV422Row_C, 2)
+#endif
+#undef UV422ANY
+
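+// ComputeCumulativeSumRow_C builds one row of a per-channel integral image:
+// cumsum[x] holds the sum of all pixels above and to the left, inclusive.
+// CumulativeSumToAverage_C then averages any axis-aligned box from its four
+// corner sums, bl[w] + tl[0] - bl[0] - tl[w], divided by the box area.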
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ int32 row_sum[4] = {0, 0, 0, 0};
+ for (int x = 0; x < width; ++x) {
+ row_sum[0] += row[x * 4 + 0];
+ row_sum[1] += row[x * 4 + 1];
+ row_sum[2] += row[x * 4 + 2];
+ row_sum[3] += row[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ }
+}
+
+void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
+ int w, int area, uint8* dst, int count) {
+ float ooa = 1.0f / area;
+ for (int i = 0; i < count; ++i) {
+ dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst += 4;
+ tl += 4;
+ bl += 4;
+ }
+}
+
+#define REPEAT8(v) ((v) | ((v) << 8))
+#define SHADE(f, v) (((v) * (f)) >> 24)
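+// SHADE approximates (f * value_channel) / 255; e.g. value = 0x80808080
+// roughly halves every channel: a channel of 200 becomes
+// (0xc8c8 * 0x8080) >> 24 = 100.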
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ for (int i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Copy pixels from rotated source to destination row with a slope.
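+// uv_dudv holds {u, v, du, dv}: the starting source coordinate and its
+// per-pixel increment; each destination pixel copies the source pixel at the
+// truncated (u, v).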
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ // Render a row of pixels from source into a buffer.
+ float uv[2];
+ uv[0] = uv_dudv[0];
+ uv[1] = uv_dudv[1];
+ for (int i = 0; i < width; ++i) {
+ int x = static_cast<int>(uv[0]);
+ int y = static_cast<int>(uv[1]);
+ *reinterpret_cast<uint32*>(dst_argb) =
+ *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
+ x * 4);
+ dst_argb += 4;
+ uv[0] += uv_dudv[2];
+ uv[1] += uv_dudv[3];
+ }
+}
+
+// C version 2x2 -> 2x1.
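+// Blends two source rows into one destination row:
+// out = (row0 * (256 - fraction) + row1 * fraction) >> 8, processing 2 ARGB
+// pixels (8 bytes) per iteration.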
+void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ uint8* end = dst_ptr + (dst_width << 2);
+ do {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+ dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+ dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+ dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+ dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+ dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+ src_ptr += 8;
+ src_ptr1 += 8;
+ dst_ptr += 8;
+ } while (dst_ptr < end);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
new file mode 100644
index 00000000..19a78330
--- /dev/null
+++ b/files/source/row_neon.cc
@@ -0,0 +1,829 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u32 {d2[0]}, [%1]! \n" \
+ "vld1.u32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n" \
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.u8 {d0}, [%0]! \n" \
+ "vld1.u8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n" \
+
+#define YUV422TORGB \
+ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
+ "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
+ "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
+ "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
+ "vtrn.u8 d0, d1 \n" \
+ "vsub.s16 q0, q0, q15 \n"/* offset y */\
+ "vmul.s16 q0, q0, q14 \n" \
+ "vadd.s16 d18, d19 \n" \
+ "vqadd.s16 d20, d0, d16 \n" \
+ "vqadd.s16 d21, d1, d16 \n" \
+ "vqadd.s16 d22, d0, d17 \n" \
+ "vqadd.s16 d23, d1, d17 \n" \
+ "vqadd.s16 d16, d0, d18 \n" \
+ "vqadd.s16 d17, d1, d18 \n" \
+ "vqrshrun.s16 d0, q10, #6 \n" \
+ "vqrshrun.s16 d1, q11, #6 \n" \
+ "vqrshrun.s16 d2, q8, #6 \n" \
+ "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
+ "vmovl.u8 q11, d1 \n" \
+ "vmovl.u8 q8, d2 \n" \
+ "vtrn.u8 d20, d21 \n" \
+ "vtrn.u8 d22, d23 \n" \
+ "vtrn.u8 d16, d17 \n" \
+ "vmov.u8 d21, d16 \n"
+
+#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \
+ defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON)
+static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+#endif
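+
+// In the row functions below d24/d25 hold kUVToRB/kUVToG, d26 is the 128
+// bias removed from U and V, q14 is the Y scale (74) and q15 the Y offset
+// (16), mirroring YG, UB, UG, VG and VR in row_common.cc. YUV422TORGB leaves
+// interleaved B, G and R bytes in d20, d21 and d22, narrowed with a rounding
+// 6 bit shift, ready for a vst3/vst4 store.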
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%5] \n"
+ "vld1.u8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%4] \n"
+ "vld1.u8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(uv_buf), // %1
+ "+r"(rgb_buf), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "vld1.u8 {d24}, [%4] \n"
+ "vld1.u8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(uv_buf), // %1
+ "+r"(rgb_buf), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_SPLITUV_NEON
+// Reads 16 pairs of UV and writes the even values to dst_u and the odd to dst_v.
+// Alignment requirement: 16 bytes for pointers, and a multiple of 16 pixels.
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.u8 {q0}, [%1]! \n" // store U
+ "vst1.u8 {q1}, [%2]! \n" // Store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_SPLITUV_NEON
+
+#ifdef HAS_COPYROW_NEON
+// Copy multiple of 64
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vldm %0!, {q0, q1, q2, q3} \n" // load 64
+ "subs %2, %2, #64 \n" // 64 processed per loop
+ "vstm %1!, {q0, q1, q2, q3} \n" // store 64
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+#ifdef HAS_SETROW_NEON
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile ( // NOLINT
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.u32 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "q0", "memory", "cc");
+}
+
+// TODO(fbarchard): Make fully assembler
+// SetRows32 writes 'width' words of a repeated 32 bit value for each of 'height' rows.
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ SetRow8_NEON(dst, v32, width << 2);
+ dst += dst_stride;
+ }
+}
+#endif // HAS_SETROW_NEON
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // compute where to start writing destination
+ "add %1, %2 \n"
+ // work on segments that are multiples of 16
+ "lsrs r3, %2, #4 \n"
+    // The output is written in two blocks: 8 bytes followed by
+    // another 8. Reading is done sequentially, from left to
+    // right; writing is done from right to left in 16-byte blocks.
+    // %1, the destination pointer, is incremented after writing
+    // the first of the two blocks, so subtract that 8 along with
+    // 16 to get the next location.
+ "mov r3, #-24 \n"
+ "beq 2f \n"
+
+    // Back the destination off by the size of the register that
+    // is going to be mirrored.
+ "sub %1, #16 \n"
+ // the loop needs to run on blocks of 16. what will be left
+ // over is either a negative number, the residuals that need
+ // to be done, or 0. If this isn't subtracted off here the
+ // loop will run one extra time.
+ "sub %2, #16 \n"
+
+    // Mirror the bytes within each 64 bit segment; the full 128
+    // bits cannot be reversed in one go, so the two 64 bit halves
+    // are written out in swapped order instead.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // src += 16
+ "subs %2, #16 \n"
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n"
+ "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
+ "bge 1b \n"
+
+    // Add 16 back to the counter. If the result is 0 there are no
+    // residuals, so jump past.
+ "adds %2, #16 \n"
+ "beq 5f \n"
+ "add %1, #16 \n"
+ "2: \n"
+ "mov r3, #-3 \n"
+ "sub %1, #2 \n"
+ "subs %2, #2 \n"
+ // check for 16*n+1 scenarios where segments_of_2 should not
+ // be run, but there is something left over.
+ "blt 4f \n"
+
+// do this in neon registers as per
+// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
+ "subs %2, #2 \n"
+ "vst1.8 {d1[0]}, [%1]! \n"
+ "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
+ "bge 3b \n"
+
+ "adds %2, #2 \n"
+ "beq 5f \n"
+ "4: \n"
+ "add %1, #1 \n"
+ "vld1.8 {d0[0]}, [%0] \n"
+ "vst1.8 {d0[0]}, [%1] \n"
+ "5: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "r3", "q0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORROWUV_NEON
+void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
+ asm volatile (
+ // compute where to start writing destination
+ "add %1, %3 \n" // dst_a + width
+ "add %2, %3 \n" // dst_b + width
+    // Work on input segments that are multiples of 16, but note
+    // the width that was passed counts output elements, half
+    // the size of the input.
+ "lsrs r12, %3, #3 \n"
+ "beq 2f \n"
+    // The output is written into two blocks.
+ "mov r12, #-8 \n"
+    // Back the destinations off by the size of the register that
+    // is going to be mirrored.
+ "sub %1, #8 \n"
+ "sub %2, #8 \n"
+ // the loop needs to run on blocks of 8. what will be left
+ // over is either a negative number, the residuals that need
+ // to be done, or 0. if this isn't subtracted off here the
+ // loop will run one extra time.
+ "sub %3, #8 \n"
+
+ // mirror the bytes in the 64 bit segments
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0]! \n" // src += 16
+ "subs %3, #8 \n"
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
+ "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
+ "bge 1b \n"
+
+    // Add 8 back to the counter. If the result is 0 there are no
+    // residuals, so return.
+ "adds %3, #8 \n"
+ "beq 4f \n"
+ "add %1, #8 \n"
+ "add %2, #8 \n"
+ "2: \n"
+ "mov r12, #-1 \n"
+ "sub %1, #1 \n"
+ "sub %2, #1 \n"
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
+ "subs %3, %3, #1 \n"
+ "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
+ "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
+ "bgt 3b \n"
+ "4: \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "r12", "q0"
+ );
+}
+#endif // HAS_MIRRORROWUV_NEON
+
+#ifdef HAS_BGRATOARGBROW_NEON
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d2 \n" // swap G, R
+ "vswp.u8 d0, d3 \n" // swap B, A
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_BGRATOARGBROW_NEON
+
+#ifdef HAS_ABGRTOARGBROW_NEON
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d0, d2 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_ABGRTOARGBROW_NEON
+
+#ifdef HAS_RGBATOARGBROW_NEON
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmov.u8 d4, d0 \n" // move A after RGB
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RGBATOARGBROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RAWTOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGBAROW_NEON
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmov.u8 d0, d4 \n" // move A before RGB.
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGBAROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.u8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.u8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.u8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.u8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "adds %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.u8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.u8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "adds %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.u8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.u8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc
index 88ce475b..33149dad 100644
--- a/files/source/row_posix.cc
+++ b/files/source/row_posix.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,652 +8,3655 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "row.h"
+#include "libyuv/row.h"
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
extern "C" {
+#endif
+
+// This module is for GCC x86 and x64
+#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+
+// GCC 4.2 on OSX has a link error when passing static or const to inline.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
#ifdef HAS_ARGBTOYROW_SSSE3
-// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
- 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+// Constants for ARGB
+CONST vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
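+
+// The vectors above and below are laid out in ARGB memory byte order
+// (B, G, R, A): kARGBToU and kARGBToV carry the same 112/-74/-38 and
+// -18/-94/112 weights as RGBToU/RGBToV in row_common.cc, while kARGBToY is
+// roughly half of the C coefficients 25/129/66.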
+
+CONST vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+CONST vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+// Constants for BGRA
+CONST vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+CONST vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+CONST vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+CONST vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+CONST vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+CONST vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+CONST uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
-extern "C" TALIGN16(const uint8, kAdd16[16]) = {
- 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+CONST uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+// Shuffle table for converting RGB24 to ARGB.
+CONST uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile(
- "movdqa (%3),%%xmm7\n"
- "movdqa (%4),%%xmm6\n"
- "movdqa %%xmm6,%%xmm5\n"
- "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
-"1:"
- "movdqa (%0),%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "psrlw $0x7,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "psrlw $0x7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "paddb %%xmm5,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "r"(kMultiplyMaskARGBToI400), // %3
- "r"(kAdd16) // %4
- : "memory"
-);
-}
-#endif
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_bg24), // %0
+// Shuffle table for converting ABGR to ARGB.
+CONST uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting BGRA to ARGB.
+CONST uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+CONST uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Shuffle table for converting ARGB to RGBA.
+CONST uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+CONST uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+CONST uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
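For reference, the shuffle tables above are operands for the SSSE3 pshufb instruction, which permutes one 16-byte register per table. A minimal scalar sketch of the per-byte semantics (the helper name is hypothetical and not part of this change): a mask byte with its high bit set writes zero, otherwise its low four bits select which source byte is copied.

#include <stdint.h>

// Scalar model of pshufb: mask bytes >= 0x80 zero the output byte, otherwise
// the low 4 bits of the mask select one of the 16 source bytes.
static void ScalarPshufb(const uint8_t src[16], const uint8_t mask[16],
                         uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}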
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskBG24ToARGB) // %3
- : "memory"
-);
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_abgr), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskABGRToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskBGRAToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_rgba), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGBAToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGBA) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskRAWToARGB) // %3
- : "memory"
-);
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
+ );
+}
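RGB565ToARGBRow_SSE2 above widens each 5- or 6-bit channel with the fixed-point multipliers 0x0108 and 0x2080, which reproduces the usual bit-replication expansion. A per-pixel scalar sketch of the net result (hypothetical helper, not part of libyuv; output bytes are stored B,G,R,A):

#include <stdint.h>

// Expand one RGB565 pixel to 8-bit B,G,R,A by replicating the top bits of
// each narrow channel into its low bits; alpha is forced to opaque.
static void RGB565PixelToARGB(uint16_t rgb565, uint8_t argb[4]) {
  uint8_t b5 = rgb565 & 0x1f;
  uint8_t g6 = (rgb565 >> 5) & 0x3f;
  uint8_t r5 = (rgb565 >> 11) & 0x1f;
  argb[0] = static_cast<uint8_t>((b5 << 3) | (b5 >> 2));  // B
  argb[1] = static_cast<uint8_t>((g6 << 2) | (g6 >> 4));  // G
  argb[2] = static_cast<uint8_t>((r5 << 3) | (r5 >> 2));  // R
  argb[3] = 255;                                          // A
}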
-#if defined(__x86_64__)
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
-// 64 bit linux gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsBgraY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsAbgrY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 1(%0),%0\n"
- "paddsw %%xmm0,%%xmm2\n"
- "shufps $0x44,%%xmm2,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movd %%xmm2,0x0(%3)\n"
- "lea 4(%3),%3\n"
- "sub $0x1,%4\n"
- "ja 1b\n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
-);
-}
-
-void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%0),%%r10\n"
- "movzb 0x1(%0),%%r11\n"
- "movq (%3,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%3,%%r11,8),%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%1)\n"
- "lea 8(%1),%1\n"
- "sub $0x2,%2\n"
- "ja 1b\n"
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%1,%0,2) \n"
+ "movdqa %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
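ARGB4444ToARGBRow_SSE2 above widens each 4-bit channel by duplicating the nibble into both halves of the output byte (equivalent to multiplying by 17). A scalar sketch of the per-pixel effect, using hypothetical helper names:

#include <stdint.h>

// Duplicate a 4-bit value into both nibbles of a byte: 0xN -> 0xNN (== n * 17).
static uint8_t Expand4To8(uint8_t nibble) {
  return static_cast<uint8_t>(((nibble & 0x0f) << 4) | (nibble & 0x0f));
}

// Expand one 2-byte ARGB4444 pixel to 4 bytes: each source nibble becomes one
// output byte, low nibble first, preserving the source channel order.
static void ARGB4444PixelToARGB(const uint8_t src[2], uint8_t dst[4]) {
  dst[0] = Expand4To8(src[0] & 0x0f);
  dst[1] = Expand4To8(src[0] >> 4);
  dst[2] = Expand4To8(src[1] & 0x0f);
  dst[3] = Expand4To8(src[1] >> 4);
}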
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
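ARGBToRGB565Row_SSE2 above packs each pixel by keeping the top 5/6/5 bits of B, G and R and dropping alpha. A per-pixel scalar sketch of the net result (hypothetical helper; ARGB bytes are stored B,G,R,A in memory):

#include <stdint.h>

// Pack one B,G,R,A pixel into RGB565: B in bits 0-4, G in 5-10, R in 11-15.
static uint16_t ARGBPixelToRGB565(const uint8_t argb[4]) {
  return static_cast<uint16_t>((argb[0] >> 3) |
                               ((argb[1] >> 2) << 5) |
                               ((argb[2] >> 3) << 11));
}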
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
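ARGBToYRow_SSSE3 above applies the kARGBToY weights with pmaddubsw/phaddw, shifts by 7 and adds 16, approximating BT.601 studio-range luma. A per-pixel scalar sketch (hypothetical helper; ARGB bytes are stored B,G,R,A in memory):

#include <stdint.h>

// Y = ((13*B + 65*G + 33*R) >> 7) + 16, i.e. roughly
// 0.098*B + 0.508*G + 0.258*R + 16.
static uint8_t ARGBPixelToY(const uint8_t* argb) {
  int b = argb[0], g = argb[1], r = argb[2];
  return static_cast<uint8_t>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}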
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
+// With fpic, GCC 4.2 for OS X runs out of GPR registers: ebx, ebp and eax
+// are effectively taken, so "m" operands can only be addressed with the 3
+// remaining general registers (4 if the stack frame is disabled). Splitting
+// the code into two assembly blocks is a workaround, but is considered unsafe.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
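ARGBToUVRow_SSSE3 above averages each 2x2 block of pixels across two rows, then applies the signed kARGBToU / kARGBToV weights and re-centers the result at 128. A scalar sketch of one 2x2 block (hypothetical helper; the rounding of the averages differs slightly from the pavgb sequence, and the shifts are assumed to be arithmetic, as psraw is):

#include <stdint.h>

// row0/row1 each point at two horizontally adjacent pixels of two vertically
// adjacent rows (4 bytes per pixel, stored B,G,R,A).
// U = ((112*B - 74*G - 38*R) >> 8) + 128
// V = ((-18*B - 94*G + 112*R) >> 8) + 128, computed on the averaged block.
static void ARGBBlockToUV(const uint8_t* row0, const uint8_t* row1,
                          uint8_t* u, uint8_t* v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = static_cast<uint8_t>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}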
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+#define UB 127 /* 2.018 * 64 = ~129, clamped to int8 max 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+struct {
+ vec8 kUVToB; // 0
+ vec8 kUVToG; // 16
+ vec8 kUVToR; // 32
+ vec16 kUVBiasB; // 48
+ vec16 kUVBiasG; // 64
+ vec16 kUVBiasR; // 80
+ vec16 kYSub16; // 96
+ vec16 kYToRgb; // 112
+ vec8 kVUToB; // 128
+ vec8 kVUToG; // 144
+ vec8 kVUToR; // 160
+} CONST SIMD_ALIGNED(kYuvConstants) = {
+ { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR },
+ { 16, 16, 16, 16, 16, 16, 16, 16 },
+ { YG, YG, YG, YG, YG, YG, YG, YG },
+ { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+};
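The constants above encode a 6-bit fixed-point version of the BT.601 YUV-to-RGB transform; the YUVTORGB macro below evaluates it with pmaddubsw and saturating adds. A scalar sketch of the per-pixel arithmetic (hypothetical helpers, not part of this change; the shift is assumed arithmetic, matching psraw):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// B = ((y-16)*YG + UB*(u-128)) >> 6, and similarly for G and R, using
// UB=127, UG=-25, VG=-52, VR=102, YG=74 from the definitions above.
static void YuvPixelToRGB(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;
  *b = Clamp255((y1 + 127 * (u - 128)) >> 6);
  *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);
  *r = Clamp255((y1 + 102 * (v - 128)) >> 6);
}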
+
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x2(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n" \
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(vu_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* vu_buf,
+ uint8* argb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(vu_buf), // %[uv_buf]
+ [argb_buf]"+r"(argb_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%[argb_buf]) \n"
+ "movdqa %%xmm0,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(bgra_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,(%[argb_buf]) \n"
+ "movdqa %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(abgr_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm5,(%[argb_buf]) \n"
+ "movdqu %%xmm0,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(bgra_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,(%[argb_buf]) \n"
+ "movdqu %%xmm1,0x10(%[argb_buf]) \n"
+ "lea 0x20(%[argb_buf]),%[argb_buf] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [argb_buf]"+r"(abgr_buf), // %[argb_buf]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "mov $0x10001000,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "mov $0x012a012a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,16(%1) \n"
+ "lea 32(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
- "+r"(width) // %2
- : "r" (_kCoefficientsRgbY) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
-);
-}
-
-#elif defined(__i386__)
-// 32 bit gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToRGB32Row\n"
-"_FastConvertYUVToRGB32Row:\n"
-#else
- ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsRgbY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToBGRARow\n"
-"_FastConvertYUVToBGRARow:\n"
-#else
- ".global FastConvertYUVToBGRARow\n"
-"FastConvertYUVToBGRARow:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsBgraY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToABGRRow\n"
-"_FastConvertYUVToABGRRow:\n"
-#else
- ".global FastConvertYUVToABGRRow\n"
-"FastConvertYUVToABGRRow:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUV444ToRGB32Row\n"
-"_FastConvertYUV444ToRGB32Row:\n"
-#else
- ".global FastConvertYUV444ToRGB32Row\n"
-"FastConvertYUV444ToRGB32Row:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
- "lea 1(%edx),%edx\n"
- "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
- "psraw $0x6,%mm0\n"
- "packuswb %mm0,%mm0\n"
- "movd %mm0,0x0(%ebp)\n"
- "lea 4(%ebp),%ebp\n"
- "sub $0x1,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYToRGB32Row\n"
-"_FastConvertYToRGB32Row:\n"
-#else
- ".global FastConvertYToRGB32Row\n"
-"FastConvertYToRGB32Row:\n"
-#endif
- "push %ebx\n"
- "mov 0x8(%esp),%eax\n"
- "mov 0xc(%esp),%edx\n"
- "mov 0x10(%esp),%ecx\n"
-
-"1:"
- "movzbl (%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
- "psraw $0x6,%mm0\n"
- "movzbl 0x1(%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
- "psraw $0x6,%mm1\n"
- "packuswb %mm1,%mm0\n"
- "lea 0x2(%eax),%eax\n"
- "movq %mm0,(%edx)\n"
- "lea 0x8(%edx),%edx\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "pop %ebx\n"
- "ret\n"
-);
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_YTOARGBROW_SSE2
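
The Y-only path above implements the formula noted in its first step, G = (y - 16) * 1.164, in 16-bit fixed point. A plain scalar sketch of the same idea (the function name and the 298/256 integer approximation are illustrative, and low-bit rounding differs from the SSE2 code):

#include <stdint.h>

static uint8_t Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

// Expand one row of Y to opaque gray ARGB: B = G = R = (y - 16) * 1.164.
void YToARGBRow_Sketch(const uint8_t* y_buf, uint8_t* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    // 1.164 is approximated as 298 / 256 in integer arithmetic.
    uint8_t value = Clamp255(((y_buf[x] - 16) * 298) >> 8);
    rgb_buf[0] = value;   // B
    rgb_buf[1] = value;   // G
    rgb_buf[2] = value;   // R
    rgb_buf[3] = 255u;    // A
    rgb_buf += 4;
  }
}
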
-#else
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
-
-static inline void YuvPixel(uint8 y,
- uint8 u,
- uint8 v,
- uint8* rgb_buf,
- int ashift,
- int rshift,
- int gshift,
- int bshift) {
-
- int b = _kCoefficientsRgbY[256+u][0];
- int g = _kCoefficientsRgbY[256+u][1];
- int r = _kCoefficientsRgbY[256+u][2];
- int a = _kCoefficientsRgbY[256+u][3];
-
- b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
- g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
- r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
- a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
-
- b = paddsw(b, _kCoefficientsRgbY[y][0]);
- g = paddsw(g, _kCoefficientsRgbY[y][1]);
- r = paddsw(r, _kCoefficientsRgbY[y][2]);
- a = paddsw(a, _kCoefficientsRgbY[y][3]);
-
- b >>= 6;
- g >>= 6;
- r >>= 6;
- a >>= 6;
-
- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
- (packuswb(g) << gshift) |
- (packuswb(r) << rshift) |
- (packuswb(a) << ashift);
-}
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+CONST uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
}
+#endif // HAS_MIRRORROW_SSSE3
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
- }
- rgb_buf += 8; // Advance 2 pixels.
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0,%2),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+CONST uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea -16(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea -16(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "sub $8,%3 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,(%1,%2) \n"
+ "lea 8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+CONST uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_SPLITUV_SSE2
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm2,(%1,%2) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SPLITUV_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "movdqa %%xmm1,0x10(%0,%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+void CopyRow_X86(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%2 \n"
+ "rep movsl \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_SETROW_X86
+void SetRow8_X86(uint8* dst, uint32 v32, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%1 \n"
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ size_t width_tmp = static_cast<size_t>(width);
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ asm volatile (
+ "rep stosl \n"
+ : "+D"(d), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+ dst += dst_stride;
}
}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+CONST uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time.
+// Same as SSE2, but replaces
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
+//    pshuflw    xmm3, xmm3,0F5h
+// with
+//    pshufb     xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%0 \n"
+ "jne 41f \n"
+ "test $0xf,%1 \n"
+ "jne 41f \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+ "jmp 49f \n"
+
+ // 4 pixel unaligned loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
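
Both blend variants above perform the same per-channel arithmetic; only the way the foreground alpha is broadcast differs (shift-and-shuffle versus a single pshufb with kShuffleAlpha). A scalar sketch of that arithmetic as read from the asm, with an illustrative helper name (src0 is the foreground whose alpha weights src1, and the result alpha is forced opaque):

#include <stdint.h>

static uint8_t AddSat255(int v) { return v > 255 ? 255 : (uint8_t)v; }

// dst = src0 + src1 * (256 - alpha0) / 256, destination alpha forced to 255.
void ARGBBlendPixel_Sketch(const uint8_t src0[4], const uint8_t src1[4],
                           uint8_t dst[4]) {
  int ia = 256 - src0[3];                                // inverse foreground alpha
  dst[0] = AddSat255(src0[0] + ((src1[0] * ia) >> 8));   // B
  dst[1] = AddSat255(src0[1] + ((src1[1] * ia) >> 8));   // G
  dst[2] = AddSat255(src0[2] + ((src1[2] * ia) >> 8));   // R
  dst[3] = 255u;                                         // A
}
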
+
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATE_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+CONST uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+CONST uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
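
Attenuation premultiplies each color channel by the pixel's own alpha. A minimal scalar sketch (illustrative name; the SIMD code's 8.8 fixed-point multiply keeps a fully opaque 255 at 255, while the plain shift below rounds slightly lower):

#include <stdint.h>

// Premultiply B, G and R by alpha; alpha itself is left unchanged.
void ARGBAttenuatePixel_Sketch(const uint8_t src[4], uint8_t dst[4]) {
  uint32_t a = src[3];
  dst[0] = (uint8_t)((src[0] * a) >> 8);  // B
  dst[1] = (uint8_t)((src[1] * a) >> 8);  // G
  dst[2] = (uint8_t)((src[2] * a) >> 8);  // R
  dst[3] = (uint8_t)a;                    // A
}
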
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movzb 0x3(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0x7(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
+ "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movzb 0xb(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0xf(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
+ "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
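
Unattenuate is the approximate inverse: each color channel is scaled back up by 255 / alpha. The SSE2 code avoids the per-pixel division by looking up a reciprocal in fixed_invtbl8; the sketch below shows only the scalar idea, and its rounding will not match the table-based path exactly:

#include <stdint.h>

static uint8_t Clamp255u(uint32_t v) { return v > 255 ? 255 : (uint8_t)v; }

// Undo premultiplication: c = c * 255 / a (clamped); alpha is unchanged.
void ARGBUnattenuatePixel_Sketch(const uint8_t src[4], uint8_t dst[4]) {
  uint32_t a = src[3];
  if (a == 0) {  // nothing to recover from a fully transparent pixel
    dst[0] = dst[1] = dst[2] = 0;
  } else {
    dst[0] = Clamp255u(src[0] * 255u / a);  // B
    dst[1] = Clamp255u(src[1] * 255u / a);  // G
    dst[2] = Clamp255u(src[2] * 255u / a);  // R
  }
  dst[3] = (uint8_t)a;
}
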
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 u = u_buf[x];
- uint8 v = v_buf[x];
- uint8 y = y_buf[x];
- YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
+CONST vec8 kARGBToGray = {
+ 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa 0x10(%0),%%xmm3 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "movdqa %%xmm1,0x10(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToGray) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
}
+#endif // HAS_ARGBGRAYROW_SSSE3
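
A scalar sketch of the gray conversion above, using the same 14/76/38 weights over 128 (illustrative function name; alpha handling mirrors the asm, which carries the source alpha through):

#include <stdint.h>

// gray = (14 * B + 76 * G + 38 * R) >> 7  (~0.11 B + 0.59 G + 0.30 R),
// written back to B, G and R; alpha is preserved.
void ARGBGrayRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t gray = (uint8_t)((14 * src_argb[0] + 76 * src_argb[1] +
                              38 * src_argb[2]) >> 7);
    dst_argb[0] = gray;
    dst_argb[1] = gray;
    dst_argb[2] = gray;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
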
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 y = y_buf[x];
- YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+CONST vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+CONST vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+CONST vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
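
A scalar sketch of the sepia formulas quoted above (illustrative name; the explicit clamp mirrors the saturating packuswb in the asm, and alpha is left untouched):

#include <stdint.h>

static uint8_t SepiaClamp(int v) { return v > 255 ? 255 : (uint8_t)v; }

// Apply the sepia weights in place; alpha is preserved.
void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    dst_argb[0] = SepiaClamp((r * 35 + g * 68 + b * 17) >> 7);
    dst_argb[1] = SepiaClamp((r * 45 + g * 88 + b * 22) >> 7);
    dst_argb[2] = SepiaClamp((r * 50 + g * 98 + b * 24) >> 7);
    dst_argb += 4;
  }
}
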
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ asm volatile (
+ "movd (%2),%%xmm2 \n"
+ "movd 0x4(%2),%%xmm3 \n"
+ "movd 0x8(%2),%%xmm4 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm6,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm0 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(matrix_argb) // %2
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// aligned to 16 bytes
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqa (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "sub $0x4,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
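
As read from the asm, each color channel is bucketed as ((c * scale) >> 16) * interval_size + interval_offset while alpha passes through. A scalar sketch under that reading (illustrative name; the clamp mirrors the saturating pack):

#include <stdint.h>

static uint8_t QClamp(int v) { return v > 255 ? 255 : (uint8_t)v; }

// Quantize B, G and R in place; alpha is preserved.
void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale, int interval_size,
                            int interval_offset, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = QClamp(v);
    }
    dst_argb += 4;
  }
}
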
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop \n"
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqa (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqa 0x10(%1,%2,1),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqa 0x20(%1,%2,1),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqa 0x30(%1,%2,1),%%xmm5 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm2,(%1) \n"
+ "movdqa %%xmm3,0x10(%1) \n"
+ "movdqa %%xmm4,0x20(%1) \n"
+ "movdqa %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 2 \n"
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
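
A scalar sketch of the cumulative-sum row described above, matching the 1-pixel tail loop of the SSE2 code: a running per-channel sum across the row, plus the previous row's cumulative sums:

#include <stdint.h>

// cumsum[x] = sum of this row up to and including x, plus the cumulative
// sum of the row above (previous_cumsum), for each of the 4 channels.
void ComputeCumulativeSumRow_Sketch(const uint8_t* row, int32_t* cumsum,
                                    const int32_t* previous_cumsum, int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
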
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) {
+ asm volatile (
+ "movd %5,%%xmm4 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "rcpss %%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop \n"
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 2 \n"
+ "10: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"(static_cast<intptr_t>(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
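
The averaging step takes the four-corner difference of the summed-area table and scales it by 1/area (the asm uses rcpss, an approximate reciprocal). A scalar sketch under the assumption that width is given in int32 elements, as the SSE2 indexing suggests; which pointer is the top row depends on the caller:

#include <stdint.h>

static uint8_t AvgClamp(int32_t v) {
  return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

// For each output pixel, combine the four summed-area corners and divide
// by the box area, per channel.
void CumulativeSumToAverage_Sketch(const int32_t* topleft,
                                   const int32_t* botleft,
                                   int width, int area, uint8_t* dst,
                                   int count) {
  float ooa = 1.0f / (float)area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32_t sum = topleft[c] - topleft[width + c] -
                    botleft[c] + botleft[width + c];
      dst[c] = AvgClamp((int32_t)(sum * ooa));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
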
+#ifdef HAS_ARGBSHADE_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "sub %0,%1 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBSHADE_SSE2
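
Shading multiplies each channel by the corresponding channel of the packed value argument. A scalar sketch of roughly dst = src * value_channel / 256 (illustrative name; the 8.8 fixed-point path in the asm keeps a 0xFF scale at full brightness, while the plain shift here rounds down slightly):

#include <stdint.h>

// Scale each channel by the matching channel of 'value' (packed B, G, R, A
// from low byte to high byte).
void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb, int width,
                         uint32_t value) {
  uint32_t vb = value & 0xff;
  uint32_t vg = (value >> 8) & 0xff;
  uint32_t vr = (value >> 16) & 0xff;
  uint32_t va = (value >> 24) & 0xff;
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = (uint8_t)((src_argb[0] * vb) >> 8);
    dst_argb[1] = (uint8_t)((src_argb[1] * vg) >> 8);
    dst_argb[2] = (uint8_t)((src_argb[2] * vr) >> 8);
    dst_argb[3] = (uint8_t)((src_argb[3] * va) >> 8);
    src_argb += 4;
    dst_argb += 4;
  }
}
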
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
+// Copy ARGB pixels from source image with slope to a row of destination.
+// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
+// an error if movq is used. movd %%xmm0,%1
+
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq (%3),%%xmm2 \n"
+ "movq 0x8(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop \n"
+ ".p2align 4 \n"
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "cvttps2dq %%xmm3,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+ "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+ "movd (%0,%1,1),%%xmm1 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "sub $0x4,%4 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop \n"
+ ".p2align 4 \n"
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
+ "and $0x0fffffff,%1 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "sub $0x1,%4 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(uv_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
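
A scalar sketch of the affine row copy: step (u, v) across the source by (du, dv) per destination pixel and fetch one ARGB pixel each time. The [u, v, du, dv] layout of uv_dudv is inferred from how the asm loads it and is an assumption here:

#include <stdint.h>

// Nearest-neighbor copy of ARGB pixels along a slope through the source.
void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                          uint8_t* dst_argb, const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = s[0];
    dst_argb[1] = s[1];
    dst_argb[2] = s[2];
    dst_argb[3] = s[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
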
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ "4: \n"
+ ".p2align 4 \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
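
A scalar sketch of the vertical interpolation, ignoring the fast paths for a zero and a half fraction and the divide-by-two rounding the SSSE3 code applies to source_y_fraction:

#include <stddef.h>
#include <stdint.h>

// Vertical blend of two rows: dst = (row0 * (256 - f) + row1 * f) >> 8,
// where f = source_y_fraction in [0, 256] and dst_width is in ARGB pixels.
void ARGBInterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  int f = source_y_fraction;
  int count = dst_width * 4;  // 4 bytes per ARGB pixel
  for (int i = 0; i < count; ++i) {
    dst_ptr[i] = (uint8_t)((src_ptr[i] * (256 - f) +
                            src_ptr[i + src_stride] * f) >> 8);
  }
}
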
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_table.cc b/files/source/row_table.cc
deleted file mode 100644
index 022d9f88..00000000
--- a/files/source/row_table.cc
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "row.h"
-
-#define kMaxStride (2048 * 4)
-
-extern "C" {
-
-#define MAKETABLE(NAME) \
-SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
- RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), \
- RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), \
- RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), \
- RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), \
- RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), \
- RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), \
- RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), \
- RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), \
- RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), \
- RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), \
- RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), \
- RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), \
- RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), \
- RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), \
- RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), \
- RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), \
- RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), \
- RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), \
- RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), \
- RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), \
- RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), \
- RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), \
- RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), \
- RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), \
- RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), \
- RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), \
- RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), \
- RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), \
- RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), \
- RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), \
- RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), \
- RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), \
- RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), \
- RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), \
- RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), \
- RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), \
- RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), \
- RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), \
- RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), \
- RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), \
- RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), \
- RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), \
- RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), RGBY(0xAB), \
- RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), \
- RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), \
- RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), \
- RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), \
- RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), \
- RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), \
- RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), \
- RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), \
- RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), \
- RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), \
- RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), \
- RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), \
- RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), \
- RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), \
- RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), \
- RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), \
- RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), \
- RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), \
- RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), \
- RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), \
- RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), \
- RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), \
- RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), \
- RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), \
- RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), \
- RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), \
- RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), \
- RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), \
- RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), \
- RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), \
- RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), \
- RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), \
- RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), \
- RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), \
- RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), \
- RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), \
- RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), \
- RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), \
- RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), \
- RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), \
- RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), \
- RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), \
- RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), \
- RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), \
- RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), \
- RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), \
- RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), \
- RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), \
- RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), \
- RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), \
- RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), \
- RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), \
- RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), \
- RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), \
- RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), \
- RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), \
- RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), \
- RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), \
- RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), \
- RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), \
- RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), \
- RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), \
- RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), \
- RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), \
- RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), \
- RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), \
- RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), \
- RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), \
- RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), \
- RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), \
- RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), \
- RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), \
- RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), \
- RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), \
- RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), \
- RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), \
- RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), \
- RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), \
- RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), \
- RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), \
- RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), \
- RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), \
- RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), \
- RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), \
- RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), \
- RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), \
- RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), \
- RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), \
- RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), \
- RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), \
- RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), \
- RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), \
- RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), \
- RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), \
- RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), \
- RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), \
- RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), \
- RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), \
- RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), \
- RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), \
- RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), \
- RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), \
- RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), \
- RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), \
- RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), \
- RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), \
- RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), \
- RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), \
- RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), \
- RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), \
- RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), \
- RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), \
- RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), \
- RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), \
- RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), \
- RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), \
- RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), \
- RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), \
- RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), \
- RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), \
- RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), \
- RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), \
- RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), \
- RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), \
- RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), \
- RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), \
- RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), \
- RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), \
- RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), \
- RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), \
- RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), \
- RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), \
- RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), \
- RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), \
- RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), \
- RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), \
- RGBV(0xCC), RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), \
- RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), \
- RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), \
- RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), \
- RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), \
- RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), \
- RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), \
- RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), \
- RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), \
- RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), \
- RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), \
- RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), \
- RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
-};
-
-// ARGB table
-#define RGBY(i) { \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(256 * 64 - 1) \
-}
-
-#define RGBU(i) { \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- 0, \
- 0 \
-}
-
-#define RGBV(i) { \
- 0, \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsRgbY)
-#else
-MAKETABLE(_kCoefficientsRgbY)
-#endif
-
-#undef RGBY
-#undef RGBU
-#undef RGBV
-
-// BGRA table
-#define RGBY(i) { \
- static_cast<int16>(256 * 64 - 1), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
-}
-
-#define RGBU(i) { \
- 0, \
- 0, \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
-}
-
-#define RGBV(i) { \
- 0, \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsBgraY)
-#else
-MAKETABLE(_kCoefficientsBgraY)
-#endif
-
-
-#undef RGBY
-#undef RGBU
-#undef RGBV
-
-// ABGR table
-#define RGBY(i) { \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- static_cast<int16>(256 * 64 - 1) \
-}
-
-#define RGBU(i) { \
- 0, \
- static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
- 0 \
-}
-
-#define RGBV(i) { \
- static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
- static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
- 0, \
- 0 \
-}
-
-#ifdef OSX
-MAKETABLE(kCoefficientsAbgrY)
-#else
-MAKETABLE(_kCoefficientsAbgrY)
-#endif
-
-
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_raw += 3;
- }
-}
-
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 b = src_bg24[0];
- uint8 g = src_bg24[1];
- uint8 r = src_bg24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
-    dst_argb[3] = 255u;
- dst_argb += 4;
- src_bg24 += 3;
- }
-}
-
-// C versions of the RGB24/RAW row functions take the same approach: convert to ARGB first, then to Y/UV.
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- BG24ToARGBRow_C(src_argb, row, pix);
- ARGBToYRow_C(row, dst_y, pix);
-}
-
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- RAWToARGBRow_C(src_argb, row, pix);
- ARGBToYRow_C(row, dst_y, pix);
-}
-
-void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_C(src_argb, row, pix);
- BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_C(src_argb, row, pix);
- RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
-}
-
-static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
-}
-static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
-}
-
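The fixed-point RGBToY/RGBToU/RGBToV helpers above encode the BT.601 studio-swing matrix. As a quick sanity check (illustration only, not part of this patch; the standalone helpers below simply mirror those formulas), pure white should land on the white point (Y, U, V) = (235, 128, 128) and pure black on (16, 128, 128):

// Illustrative check of the fixed-point coefficients used above.
#include <cassert>
#include <cstdint>

static int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
static int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}

int main() {
  assert(RGBToY(255, 255, 255) == 235);  // BT.601 white level
  assert(RGBToU(255, 255, 255) == 128);  // neutral chroma
  assert(RGBToV(255, 255, 255) == 128);
  assert(RGBToY(0, 0, 0) == 16);         // BT.601 black level
  return 0;
}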
-#define MAKEROWY(NAME,R,G,B) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- for (int x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += 4; \
- dst_y += 1; \
- } \
-} \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- for (int x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
- src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
- src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
- src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += 8; \
- src_rgb1 += 8; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
-}
-
-MAKEROWY(ARGB,2,1,0)
-MAKEROWY(BGRA,1,2,3)
-MAKEROWY(ABGR,0,1,2)
-
-#if defined(HAS_RAWTOYROW_SSSE3)
-
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- ARGBToYRow_SSSE3(row, dst_y, pix);
-}
-
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- ARGBToYRow_SSSE3(row, dst_y, pix);
-}
-
-#endif
-
-#if defined(HAS_RAWTOUVROW_SSSE3)
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-#else
-
-void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- BG24ToARGBRow_SSSE3(src_argb, row, pix);
- BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int pix) {
- SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
- RAWToARGBRow_SSSE3(src_argb, row, pix);
- RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
- ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
-}
-
-#endif
-#endif
-
-} // extern "C"
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 2bc5fb13..e3b01f27 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,173 +8,925 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "row.h"
+#include "libyuv/row.h"
+#ifdef __cplusplus
+namespace libyuv {
extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const int8, kARGBToY[16]) = {
+// Constants for ARGB.
+static const vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
-extern "C" TALIGN16(const int8, kARGBToU[16]) = {
+static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
-extern "C" TALIGN16(const int8, kARGBToV[16]) = {
+static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
-// Constants for BGRA
-extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
-extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
+static const vec8 kBGRAToU = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
-extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
+static const vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
-// Constants for ABGR
-extern "C" TALIGN16(const int8, kABGRToY[16]) = {
+// Constants for ABGR.
+static const vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
-extern "C" TALIGN16(const int8, kABGRToU[16]) = {
+static const vec8 kABGRToU = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
-extern "C" TALIGN16(const int8, kABGRToV[16]) = {
+static const vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
-extern "C" TALIGN16(const uint8, kAddY16[16]) = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
-extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
+static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values
-__declspec(naked)
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Shuffle table for converting ARGB to RGBA.
+static const uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_bgra
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskBGRAToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_abgr
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskABGRToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_rgba
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskRGBAToARGB
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgba
+ mov ecx, [esp + 12] // pix
+ movdqa xmm5, kShuffleMaskARGBToRGBA
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRAWToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
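Per channel, the pmul trick described above is equivalent to the usual bit-replication idiom. A minimal scalar sketch (illustration only; the helper name is not part of libyuv), assuming 5-bit red/blue and 6-bit green fields:

#include <cstdint>

// Expand one RGB565 pixel to 8-bit channels by replicating the top bits,
// which is what the (256 + 8) and (256 + 4) multipliers achieve.
static void RGB565ToARGBPixel(uint16_t rgb565, uint8_t argb[4]) {
  uint8_t b5 = rgb565 & 0x1f;
  uint8_t g6 = (rgb565 >> 5) & 0x3f;
  uint8_t r5 = (rgb565 >> 11) & 0x1f;
  argb[0] = (b5 << 3) | (b5 >> 2);  // == (b5 * (256 + 8)) >> 5
  argb[1] = (g6 << 2) | (g6 >> 4);  // == (g6 * (256 + 4)) >> 6
  argb[2] = (r5 << 3) | (r5 >> 2);
  argb[3] = 255;                    // opaque alpha
}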
+
+// 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1
+ por xmm2, xmm3
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
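Similarly, the nibble masking and OR of shifted copies above amounts to multiplying each 4-bit field by 17 (0x11). A hedged scalar sketch of one pixel, for illustration only (helper name not part of libyuv):

#include <cstdint>

// ARGB4444 -> ARGB8888: each 4-bit channel c expands to (c << 4) | c, i.e. c * 17.
static void ARGB4444ToARGBPixel(uint16_t argb4444, uint8_t argb[4]) {
  argb[0] = (argb4444 & 0x000f) * 17;          // B
  argb[1] = ((argb4444 >> 4) & 0x000f) * 17;   // G
  argb[2] = ((argb4444 >> 8) & 0x000f) * 17;   // R
  argb[3] = ((argb4444 >> 12) & 0x000f) * 17;  // A
}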
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+    psrld      xmm0, 4
+    psrld      xmm1, 8
+ por xmm0, xmm1
+ packuswb xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kARGBToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
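In scalar terms, the kARGBToY / kAddY16 path above computes roughly Y = ((13*B + 65*G + 33*R) >> 7) + 16 per pixel: pmaddubsw and phaddw form the weighted sum, psrlw 7 scales it, and paddb adds the 16 offset. A minimal C sketch of the same arithmetic, for illustration only (the helper name is not part of libyuv):

#include <cstdint>

// One row of ARGB (B,G,R,A byte order in memory) to Y, mirroring the
// SSSE3 coefficients {13, 65, 33, 0} and the +16 offset. No rounding
// term is added, matching the psrlw-based kernel above.
static void ARGBToYRow_Scalar(const uint8_t* src_argb, uint8_t* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = static_cast<uint8_t>(
        ((13 * src_argb[0] + 65 * src_argb[1] + 33 * src_argb[2]) >> 7) + 16);
    src_argb += 4;
  }
}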
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kBGRAToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm7, _kABGRToY
- movdqa xmm6, _kAddY16
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
- convertloop :
+ align 16
+ convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
lea eax, [eax + 64]
phaddw xmm0, xmm1
phaddw xmm2, xmm3
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- paddb xmm0, xmm6
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -185,12 +937,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kARGBToU
- movdqa xmm6, _kARGBToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -227,18 +980,89 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
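The two steps called out in the comments above (a 2x2 subsample, then a pmaddubsw dot product with kARGBToU / kARGBToV and a +128 bias) boil down to roughly U = ((112*B - 74*G - 38*R) >> 8) + 128 and V = ((112*R - 94*G - 18*B) >> 8) + 128 on the averaged pixel. A hedged scalar sketch (illustration only; helper name not part of libyuv; pavgb rounds each pairwise average, so results can differ by a bit, and an arithmetic right shift on negative intermediates is assumed):

#include <cstdint>

// Subsample pairs of ARGB pixels across two rows to U and V values,
// approximating the SSSE3 path above with a plain 2x2 average.
static void ARGBToUVRow_Scalar(const uint8_t* row0, int stride,
                               uint8_t* dst_u, uint8_t* dst_v, int pix) {
  const uint8_t* row1 = row0 + stride;
  for (int x = 0; x < pix; x += 2) {
    int b = (row0[0] + row0[4] + row1[0] + row1[4]) >> 2;
    int g = (row0[1] + row0[5] + row1[1] + row1[5]) >> 2;
    int r = (row0[2] + row0[6] + row1[2] + row1[6]) >> 2;
    *dst_u++ = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = static_cast<uint8_t>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    row0 += 8;
    row1 += 8;
  }
}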
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -249,12 +1073,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kBGRAToU
- movdqa xmm6, _kBGRAToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -291,18 +1116,89 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -313,12 +1209,13 @@ __asm {
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kABGRToU
- movdqa xmm6, _kABGRToV
- movdqa xmm5, _kAddUV128
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- convertloop :
+ align 16
+ convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -355,282 +1252,2846 @@ __asm {
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
+ sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
sub ecx, 16
- ja convertloop
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
pop edi
pop esi
ret
}
}
-__declspec(naked)
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
- mov eax, [esp + 4] // src_bg24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskBG24ToARGB
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
- convertloop :
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
- movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129, clamped to int8 max of 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
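Taken together, these constants implement a 6-bit fixed-point BT.601 conversion: the UV terms are pre-biased by 128 via kUVBias*, Y is offset by 16 and scaled by YG, and the sum is shifted right by 6 and saturated. A minimal scalar sketch of the same arithmetic (illustration only, not part of the patch; the kernels below use saturating adds and packuswb where this sketch clamps explicitly):

#include <cstdint>

static uint8_t Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// One YUV pixel to ARGB using the fixed-point constants above
// (YG=74, UB=127, UG=-25, VG=-52, VR=102), shifted down by 6.
static void YUVToARGBPixel(uint8_t y, uint8_t u, uint8_t v, uint8_t argb[4]) {
  int y1 = (y - 16) * 74;
  argb[0] = Clamp255((y1 + (u - 128) * 127) >> 6);                    // B
  argb[1] = Clamp255((y1 + (u - 128) * -25 + (v - 128) * -52) >> 6);  // G
  argb[2] = Clamp255((y1 + (v - 128) * 102) >> 6);                    // R
  argb[3] = 255;                                                      // A
}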
+
+// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444.
+#define READYUV444 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 2 UV from 411, upsample to 8 UV.
+#define READYUV411 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 2] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
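The READ* macros above differ only in how many chroma samples they fetch and how far they fan them out: 444 reads one U/V pair per pixel, 422 duplicates each pair across two pixels (punpcklwd), 411 across four (punpcklwd + punpckldq), and NV12 reads already interleaved UV pairs. A small sketch of the 422 case, for illustration only (helper name not part of libyuv):

#include <cstdint>

// Upsample 4 U and 4 V samples to 8 interleaved UV pairs (4:2:2 -> 4:4:4),
// mirroring what READYUV422 does with punpcklbw + punpcklwd.
static void UpsampleUV422(const uint8_t u[4], const uint8_t v[4], uint8_t uv[16]) {
  for (int i = 0; i < 4; ++i) {
    uv[4 * i + 0] = u[i];  // pixel 2*i
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i];  // pixel 2*i + 1 reuses the same chroma
    uv[4 * i + 3] = v[i];
  }
}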
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Convert 8 pixels: 8 VU and 8 Y.
+#define YVUTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// 8 pixels, dest aligned 16.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* argb_buf,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* abgr_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgba_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ mov eax, 0x10001000
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ mov eax, 0x012a012a
+ movd xmm2, eax
+ pshufd xmm2, xmm2, 0
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ align 16
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y.Y
+ psubusw xmm0, xmm3
+ pmulhuw xmm0, xmm2
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4
+ por xmm1, xmm4
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_SSE2
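For reference, a rough scalar sketch of what the routine above computes: each Y sample is expanded with the BT.601 ramp, roughly (y - 16) * 1.164, clamped, and replicated to B, G and R with alpha forced to 255. The helper below is illustrative C, not a libyuv function, and its rounding only approximates the SIMD fixed-point path.

// Scalar sketch: 1.164 ~= 298 / 256, matching the 0x012a multiplier above.
static void YToARGBRow_Sketch(const unsigned char* y_buf,
                              unsigned char* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    int g = ((y_buf[x] - 16) * 298) >> 8;
    if (g < 0) g = 0;
    if (g > 255) g = 255;
    rgb_buf[0] = (unsigned char)g;  // B
    rgb_buf[1] = (unsigned char)g;  // G
    rgb_buf[2] = (unsigned char)g;  // R
    rgb_buf[3] = 255;               // A
    rgb_buf += 4;
  }
}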
+
+#ifdef HAS_MIRRORROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx]
+ pshufb xmm0, xmm5
sub ecx, 16
- ja convertloop
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
+#endif // HAS_MIRRORROW_SSSE3
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
+#ifdef HAS_MIRRORROW_SSE2
+// SSE2 version has movdqu so it can be used on unaligned buffers where the
+// SSSE3 version cannot.
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskRAWToARGB
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax + ecx]
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
- convertloop :
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16]
+ sub edi, edx
+
+ align 16
+ convertloop:
movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ sub ecx, 8
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+
+// Shuffle table for reversing the order of ARGB pixels (4 bytes at a time).
+static const uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kARGBShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx * 4]
+ pshufb xmm0, xmm5
+ sub ecx, 4
movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- ja convertloop
+ lea edx, [edx + 16]
+ jg convertloop
ret
}
}
+#endif // HAS_ARGBMIRRORROW_SSSE3
-__declspec(naked)
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+#ifdef HAS_SPLITUV_SSE2
+__declspec(naked) __declspec(align(16))
+void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsRgbY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsRgbY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqa [edx], xmm0
+ movdqa [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_SPLITUV_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte load/store, 32 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 32
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+__declspec(naked) __declspec(align(16))
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep movsd
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_SETROW_X86
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// SetRows32 writes 'width' words of a repeated 32 bit value to each of
+// 'height' rows.
+__declspec(naked) __declspec(align(16))
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ __asm {
+ push esi
+ push edi
+ push ebp
+ mov edi, [esp + 12 + 4] // dst
+ mov eax, [esp + 12 + 8] // v32
+ mov ebp, [esp + 12 + 12] // width
+ mov edx, [esp + 12 + 16] // dst_stride
+ mov esi, [esp + 12 + 20] // height
+ lea ecx, [ebp * 4]
+ sub edx, ecx // stride - width * 4
+
+ align 16
+ convertloop:
+ mov ecx, ebp
+ rep stosd
+ add edi, edx
+ sub esi, 1
+ jg convertloop
+
+ pop ebp
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3,0F5h // 8 alpha words
+ pshuflw xmm3, xmm3,0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
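Both blend rows (this SSE2 version and the SSSE3 variant that follows) implement the same per-pixel math: the background is scaled by 256 minus the foreground alpha, added with saturation to the foreground, and the output alpha is forced to 255. A minimal scalar sketch, with an illustrative helper name:

// Scalar sketch of the ARGB "over" blend:
// out = fg + bg * (256 - fg_alpha) / 256, saturated, with out alpha = 255.
static void ARGBBlendRow_Sketch(const unsigned char* src_argb0,  // foreground
                                const unsigned char* src_argb1,  // background
                                unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb0[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = src_argb0[c] + ((src_argb1[c] * (256 - a)) >> 8);
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb[3] = 255;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}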
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3,0F5h // 8 alpha words
+// pshuflw xmm3, xmm3,0F5h
+// with:
+// pshufb xmm3, kShuffleAlpha // alpha
+// Blend 4 pixels at a time.
+
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ test eax, 15 // unaligned?
+ jne convertuloop4
+ test esi, 15 // unaligned?
+ jne convertuloop4
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqa xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqa xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqa xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+ jmp convertloop4b
+
+ // 4 pixel unaligned loop.
+ convertuloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertuloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0,0FFh // 8 alpha words
+ pshuflw xmm2, xmm2,0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1,0FFh // 8 alpha words
+ pshuflw xmm2, xmm2,0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // alphas
+ psrlw xmm0, 8
+ pand xmm2, xmm4
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATE_SSE2
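Attenuation premultiplies each color channel by its pixel's alpha while leaving alpha itself untouched. A scalar sketch of that intent, using the same >>8 style approximation of /255 as the SIMD path; the helper name is illustrative:

// Scalar sketch of alpha attenuation: premultiply B, G, R by alpha.
static void ARGBAttenuateRow_Sketch(const unsigned char* src_argb,
                                    unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[3];
    dst_argb[0] = (unsigned char)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (unsigned char)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (unsigned char)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (unsigned char)a;                         // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}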
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsBgraY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsBgraY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, kShuffleAlpha0
+ movdqa xmm5, kShuffleAlpha1
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqa xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqa xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // mask original alpha
+ pand xmm2, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
- pushad
- mov edx, [esp + 32 + 4]
- mov edi, [esp + 32 + 8]
- mov esi, [esp + 32 + 12]
- mov ebp, [esp + 32 + 16]
- mov ecx, [esp + 32 + 20]
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
- movzx ebx, byte ptr [edx + 1]
- movq mm1, [_kCoefficientsAbgrY + 8 * eax]
- lea edx, [edx + 2]
- movq mm2, [_kCoefficientsAbgrY + 8 * ebx]
- paddsw mm1, mm0
- paddsw mm2, mm0
- psraw mm1, 6
- psraw mm2, 6
- packuswb mm1, mm2
- movntq [ebp], mm1
- lea ebp, [ebp + 8]
- sub ecx, 2
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- pushad
- mov edx, [esp + 32 + 4] // Y
- mov edi, [esp + 32 + 8] // U
- mov esi, [esp + 32 + 12] // V
- mov ebp, [esp + 32 + 16] // rgb
- mov ecx, [esp + 32 + 20] // width
-
- convertloop :
- movzx eax, byte ptr [edi]
- lea edi, [edi + 1]
- movzx ebx, byte ptr [esi]
- lea esi, [esi + 1]
- movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
- movzx eax, byte ptr [edx]
- paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
- lea edx, [edx + 1]
- paddsw mm0, [_kCoefficientsRgbY + 8 * eax]
- psraw mm0, 6
- packuswb mm0, mm0
- movd [ebp], mm0
- lea ebp, [ebp + 4]
- sub ecx, 1
- ja convertloop
-
- popad
- ret
- }
-}
-
-__declspec(naked)
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- __asm {
- push ebx
- mov eax, [esp + 4 + 4] // Y
- mov edx, [esp + 4 + 8] // rgb
- mov ecx, [esp + 4 + 12] // width
-
- convertloop :
- movzx ebx, byte ptr [eax]
- movq mm0, [_kCoefficientsRgbY + 8 * ebx]
- psraw mm0, 6
- movzx ebx, byte ptr [eax + 1]
- movq mm1, [_kCoefficientsRgbY + 8 * ebx]
- psraw mm1, 6
- packuswb mm0, mm1
- lea eax, [eax + 2]
- movq [edx], mm0
- lea edx, [edx + 8]
- sub ecx, 2
- ja convertloop
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+ sub edx, eax
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
- pop ebx
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqa xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * a
+
+ movdqa xmm2, [eax] // alphas
+ pand xmm2, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm2
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ pop esi
ret
}
}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
-#endif
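Unattenuation is the inverse operation: each color channel is scaled back up by a reciprocal of alpha, which the SIMD path reads from the fixed_invtbl8 table referenced above. The sketch below uses a plain integer divide instead of that fixed-point table, so its rounding differs slightly; the helper name is illustrative:

// Scalar sketch of unattenuation: undo the premultiply, clamping to 255.
static void ARGBUnattenuateRow_Sketch(const unsigned char* src_argb,
                                      unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = a ? (src_argb[c] * 255) / a : src_argb[c];
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (unsigned char)a;
    src_argb += 4;
    dst_argb += 4;
  }
}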
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
+static const vec8 kARGBToGray = {
+ 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToGray
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqa xmm2, [eax] // A
+ movdqa xmm3, [eax + 16]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ sub ecx, 8
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
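A scalar sketch of the gray conversion, applying the same 14/76/38 weights (out of 128) to B, G and R and preserving alpha; the helper name is illustrative:

// Scalar sketch: gray = (B * 14 + G * 76 + R * 38) >> 7, replicated to B, G, R.
static void ARGBGrayRow_Sketch(const unsigned char* src_argb,
                               unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int gray = (src_argb[0] * 14 + src_argb[1] * 76 + src_argb[2] * 38) >> 7;
    dst_argb[0] = (unsigned char)gray;  // B
    dst_argb[1] = (unsigned char)gray;  // G
    dst_argb[2] = (unsigned char)gray;  // R
    dst_argb[3] = src_argb[3];          // A preserved
    src_argb += 4;
    dst_argb += 4;
  }
}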
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
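A scalar sketch of the in-place sepia transform using the coefficients above; packuswb provides the clamp in the SIMD path, so the sketch clamps explicitly. The helper name is illustrative:

// Scalar sketch of the sepia tone, applied in place; alpha is untouched.
static void ARGBSepiaRow_Sketch(unsigned char* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (unsigned char)(sb > 255 ? 255 : sb);
    dst_argb[1] = (unsigned char)(sg > 255 ? 255 : sg);
    dst_argb[2] = (unsigned char)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}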
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov edx, [esp + 8] /* matrix_argb */
+ mov ecx, [esp + 12] /* width */
+ movd xmm2, [edx]
+ movd xmm3, [edx + 4]
+ movd xmm4, [edx + 8]
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pshufd xmm4, xmm4, 0
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm6 // B
+ phaddsw xmm5, xmm1 // G
+ psraw xmm0, 7 // B
+ psraw xmm5, 7 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddsw xmm5, xmm1
+ psraw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklbw xmm5, xmm6 // 8 RA values
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] /* dst_argb */
+ mov edi, [esp + 16 + 8] /* table_argb */
+ mov ecx, [esp + 16 + 12] /* width */
+ xor ebx, ebx
+ xor edx, edx
+
+ align 16
+ convertloop:
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ and ebp, 255
+ shr esi, 8
+ and esi, 255
+ mov bl, [edi + ebp * 4 + 0] // B
+ mov dl, [edi + esi * 4 + 1] // G
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ shr ebp, 16
+ shr esi, 24
+ and ebp, 255
+ mov [eax], bl
+ mov [eax + 1], dl
+ mov bl, [edi + ebp * 4 + 2] // R
+ mov dl, [edi + esi * 4 + 3] // A
+ mov [eax + 2], bl
+ mov [eax + 3], dl
+ lea eax, [eax + 4]
+ sub ecx, 1
+ jg convertloop
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
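The table is laid out as 256 entries of 4 bytes, and each channel indexes it independently, picking up that entry's matching channel. A scalar sketch with an illustrative helper name:

// Scalar sketch of the in-place color table lookup.
static void ARGBColorTableRow_Sketch(unsigned char* dst_argb,
                                     const unsigned char* table_argb,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}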
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h
+ pshufd xmm2, xmm2, 044h
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqa xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_offset
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm7
+ sub ecx, 4
+ movdqa [eax], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
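Quantization maps each color channel to v = (v * scale >> 16) * interval_size + interval_offset while keeping alpha. A scalar sketch of that intent (the SIMD path's alpha handling differs in detail); the helper name is illustrative:

// Scalar sketch of in-place quantization of B, G, R; alpha is preserved.
static void ARGBQuantizeRow_Sketch(unsigned char* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {
      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (unsigned char)(v > 255 ? 255 : v);
    }
    dst_argb += 4;
  }
}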
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at a time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is the offset from left to right of the area in the CumulativeSum
+// buffer, measured in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm4, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm4, xmm4
+ rcpss xmm4, xmm4 // 1.0f / area
+ pshufd xmm4, xmm4, 0
+ sub ecx, 4
+ jl l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movdqa xmm0, [eax]
+ psubd xmm0, [eax + edx * 4]
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]
+ paddd xmm0, [esi + edx * 4]
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
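The averaging above is the classic summed-area-table box filter: for each output pixel, sum = top-left - top-right - bottom-left + bottom-right per channel, then divide by the area. A scalar sketch using integer division where the SIMD path uses a float reciprocal; the helper name is illustrative and assumes 32 bit int:

// Scalar sketch of box averaging from a cumulative-sum buffer (4 ints/pixel).
static void CumulativeSumToAverage_Sketch(const int* topleft,
                                          const int* botleft,
                                          int width, int area,
                                          unsigned char* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int sum = topleft[i * 4 + c] - topleft[i * 4 + width + c] -
                botleft[i * 4 + c] + botleft[i * 4 + width + c];
      dst[i * 4 + c] = (unsigned char)(sum / area);
    }
  }
}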
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ sub esi, edx
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ sub ecx, 4
+ jl l4b
+ test edx, 15
+ jne l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1
+ punpckhwd xmm3, xmm1
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+
+ paddd xmm0, xmm2
+ movdqa xmm2, [edx + esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqa xmm3, [edx + esi + 16]
+ paddd xmm3, xmm0
+ paddd xmm0, xmm4
+ movdqa xmm4, [edx + esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqa xmm5, [edx + esi + 48]
+ paddd xmm5, xmm0
+
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ movdqa [edx + 32], xmm4
+ movdqa [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [edx + esi]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
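A scalar sketch of the cumulative-sum row: a running per-channel sum of the current row is added to the previous row's cumulative sums, so each output value covers everything above and to the left, inclusive. The helper name is illustrative and assumes 32 bit int:

// Scalar sketch: cumsum[x] = previous_cumsum[x] + sum(row[0..x]) per channel.
static void ComputeCumulativeSumRow_Sketch(const unsigned char* row,
                                           int* cumsum,
                                           const int* previous_cumsum,
                                           int width) {
  int sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}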
+
+#ifdef HAS_ARGBSHADE_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ sub edx, eax
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADE_SSE2
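Shading scales every channel, alpha included, by the matching byte of 'value', roughly c * v / 255. A scalar sketch with an illustrative helper name; the byte layout of 'value' here assumes the little-endian B, G, R, A order used above:

// Scalar sketch of the shade operation: per-channel multiply by 'value'.
static void ARGBShadeRow_Sketch(const unsigned char* src_argb,
                                unsigned char* dst_argb, int width,
                                unsigned int value) {
  unsigned char v[4];
  v[0] = (unsigned char)(value & 0xff);          // B scale
  v[1] = (unsigned char)((value >> 8) & 0xff);   // G scale
  v[2] = (unsigned char)((value >> 16) & 0xff);  // R scale
  v[3] = (unsigned char)((value >> 24) & 0xff);  // A scale
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[c] = (unsigned char)((src_argb[c] * v[c]) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}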
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+__declspec(naked) __declspec(align(16))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ align 4
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ sub ecx, 4
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
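The affine row walks a source coordinate (u, v) with a constant per-pixel step (du, dv), truncates to integers and copies one ARGB pixel per destination pixel. A scalar sketch with an illustrative helper name:

// Scalar sketch of the affine row copy: offset = (int)u * 4 + (int)v * stride.
static void ARGBAffineRow_Sketch(const unsigned char* src_argb,
                                 int src_argb_stride, unsigned char* dst_argb,
                                 const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int x = 0; x < width; ++x) {
    const unsigned char* p = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[0] = p[0];
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}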
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
+__declspec(naked) __declspec(align(16))
+void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ cmp eax, 0
+ je xloop1
+ cmp eax, 64
+ je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ pop edi
+ pop esi
+ ret
+ }
+}
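The bilinear filter blends each byte of a row with the byte one stride below it, weighted by source_y_fraction; the SSSE3 code halves the fraction to 0..127 and special-cases 0 (copy) and 64 (average). A scalar sketch with an illustrative helper name:

// Scalar sketch: dst = (src * (128 - f) + src_below * f) >> 7 per byte.
static void ARGBInterpolateRow_Sketch(unsigned char* dst_ptr,
                                      const unsigned char* src_ptr,
                                      int src_stride, int dst_width,
                                      int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127, as in the SSSE3 path
  int bytes = dst_width * 4;       // 4 bytes per ARGB pixel
  if (f == 0) {
    for (int i = 0; i < bytes; ++i) dst_ptr[i] = src_ptr[i];
    return;
  }
  for (int i = 0; i < bytes; ++i) {
    dst_ptr[i] = (unsigned char)(
        (src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7);
  }
}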
+
+#endif // _M_IX86
+
+#ifdef __cplusplus
} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale.cc b/files/source/scale.cc
index d3b7d333..38910c91 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -12,34 +12,37 @@
#include <assert.h>
#include <string.h>
+#include <stdlib.h> // For getenv()
#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
-#if defined(_MSC_VER)
-#define ALIGN16(var) __declspec(align(16)) var
-#else
-#define ALIGN16(var) var __attribute__((aligned(16)))
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
#endif
-// Note: A Neon reference manual
-// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
+// Bilinear SSE2 is disabled.
+#define SSE2_DISABLED 1
+
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-namespace libyuv {
-
// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.
-
static bool use_reference_impl_ = false;
+LIBYUV_API
void SetUseReferenceImpl(bool use) {
use_reference_impl_ = use;
}
+// ScaleRowDown2Int also used by planar functions
+
/**
* NEON downscalers with interpolation.
*
@@ -47,126 +50,53 @@ void SetUseReferenceImpl(bool use) {
*
*/
-#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
- uint8* dst, int dst_width) {
- __asm__ volatile
- (
- "1:\n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
- "vst1.u8 {q0}, [%1]! \n" // store even pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- __asm__ volatile
- (
- "mov r4, #2 \n" // rounding constant
- "add %1, %0 \n" // change the stride to row 2 pointer
- "vdup.16 q4, r4 \n"
- "1:\n"
- "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
- "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
- "vpadal.u8 q1, q3 \n"
- "vadd.u16 q0, q4 \n" // rounding
- "vadd.u16 q1, q4 \n"
- "vshrn.u16 d0, q0, #2 \n" // downshift and pack
- "vshrn.u16 d1, q1, #2 \n"
- "vst1.u8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List
- );
-}
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN4_NEON
-// Expecting widths on arm devices to be smaller. Went with 8x4 blocks
-// to get most coverage. Look to back and evaluate 16x4 blocks with
-// handling of leftovers.
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
- uint8* dst_ptr, int dst_width) {
- __asm__ volatile
- (
- "mov r4, #4 \n"
- "1: \n"
- "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to
- "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block
-
- "vst1.u16 {d0[0]}, [%1]! \n"
-
- "subs %2, #2 \n" // dst_width -= 2
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "r4", "q0", "q1", "memory", "cc"
- );
-}
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
-static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm__ volatile
- (
- "1: \n"
- "mov r4, %0 \n"
- "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data
- "vld1.u8 {d1}, [r4],%3 \n"
- "vld1.u8 {d2}, [r4],%3 \n"
- "vld1.u8 {d3}, [r4] \n"
-
- // data is loaded up int q0 and q1
- // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
- // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a20 a21 a22 a23 b20 b21 b22 b23
- // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
- "vpaddl.u8 q0, q0 \n"
-
- // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
- // d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23
- "vpadal.u8 q0, q1 \n"
-
- // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
- // d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23
- "vpaddl.u16 q0, q0 \n"
-
-
- // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23
- // b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23
- "vadd.u32 d0, d1 \n"
-
- "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding
-
- "vst1.u8 {d0[0]}, [%1]! \n"
- "vst1.u8 {d0[4]}, [%1]! \n"
-
- "add %0, #8 \n" // move src pointer to next 8 pixels
- "subs %2, #2 \n" // dst_width -= 2
- "bhi 1b \n"
-
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "q0", "q1", "memory", "cc"
- );
-}
+#define HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load up every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#define HAS_SCALEROWDOWN38_NEON
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 16x2 -> 16x1
+#define HAS_SCALEFILTERROWS_NEON
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction);
/**
* SSE2 downscalers with interpolation.
@@ -175,137 +105,141 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
*
*/
-// Constants for SSE2 code
-#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
- !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif defined(OSX)
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+
+// Constants for SSSE3 code
+#elif !defined(YUV_DISABLE_ASM) && \
+ (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
+
+// GCC 4.2 on OSX has link error when passing static or const to inline.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
#else
-#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
+#define CONST static const
#endif
// Offsets for source bytes 0 to 9
-extern "C" TALIGN16(const uint8, shuf0[16]) =
+CONST uvec8 kShuf0 =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-extern "C" TALIGN16(const uint8, shuf1[16]) =
+CONST uvec8 kShuf1 =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-extern "C" TALIGN16(const uint8, shuf2[16]) =
+CONST uvec8 kShuf2 =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10
-extern "C" TALIGN16(const uint8, shuf01[16]) =
+CONST uvec8 kShuf01 =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-extern "C" TALIGN16(const uint8, shuf11[16]) =
+CONST uvec8 kShuf11 =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-extern "C" TALIGN16(const uint8, shuf21[16]) =
+CONST uvec8 kShuf21 =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10
-extern "C" TALIGN16(const uint8, madd01[16]) =
+CONST uvec8 kMadd01 =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21
-extern "C" TALIGN16(const uint8, madd11[16]) =
+CONST uvec8 kMadd11 =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31
-extern "C" TALIGN16(const uint8, madd21[16]) =
+CONST uvec8 kMadd21 =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Coefficients for source bytes 21 to 31
-extern "C" TALIGN16(const int16, round34[8]) =
+CONST vec16 kRound34 =
{ 2, 2, 2, 2, 2, 2, 2, 2 };
-extern "C" TALIGN16(const uint8, shuf38a[16]) =
+CONST uvec8 kShuf38a =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-extern "C" TALIGN16(const uint8, shuf38b[16]) =
+CONST uvec8 kShuf38b =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 0,1,2
-extern "C" TALIGN16(const uint8, shufac0[16]) =
+CONST uvec8 kShufAc =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5
-extern "C" TALIGN16(const uint8, shufac3[16]) =
+CONST uvec8 kShufAc3 =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3
-extern "C" TALIGN16(const uint16, scaleac3[8]) =
+CONST uvec16 kScaleAc33 =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab0[16]) =
+CONST uvec8 kShufAb0 =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab1[16]) =
+CONST uvec8 kShufAb1 =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5
-extern "C" TALIGN16(const uint8, shufab2[16]) =
+CONST uvec8 kShufAb2 =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2
-extern "C" TALIGN16(const uint16, scaleab2[8]) =
+CONST uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
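
// The kScaleAc33/kScaleAb2 vectors above hold 16 bit fixed point reciprocals
// so that pmulhuw (an unsigned (a * b) >> 16) can replace division by the box
// size. A minimal sketch of the idea, assuming a 3x3 box sum (the truncated
// constant can land one below the exact quotient):
#include <stdint.h>
static inline uint8_t DivideBy9(uint16_t box_sum) {  // box_sum <= 9 * 255
  return (uint8_t)(((uint32_t)box_sum * (65536 / 9)) >> 16);  // ~ box_sum / 9
}
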
-#if defined(WIN32) && !defined(COVERAGE_ENABLED)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
+ jg wloop
ret
}
}
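
// A minimal scalar sketch of the point sampling above (illustrative name):
// the 0x00ff mask plus packuswb keeps the even numbered pixel of each pair.
#include <stdint.h>
static void RowDown2Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2];
  }
}
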
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ align 16
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -319,16 +253,91 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
+ pand xmm2, xmm5
+ pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
+ sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
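
// A minimal scalar sketch of the 32x2 -> 16x1 blend above (illustrative
// name): each output pixel is the rounded average of a 2x2 box. The SSE2
// path reaches this with two pavg stages, which may differ from the single
// rounded sum below by at most one.
#include <stdint.h>
static void RowDown2Box_C(const uint8_t* src, const uint8_t* next_row,
                          uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[x * 2] + src[x * 2 + 1] +
                        next_row[x * 2] + next_row[x * 2 + 1] + 2) >> 2);
  }
}
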
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
sub ecx, 16
- ja wloop
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
pop esi
ret
@@ -338,63 +347,64 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x000000ff
- psrld xmm7, 24
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
+ psrld xmm5, 24
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
sub ecx, 8
- ja wloop
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
- popad
ret
}
}
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
- lea edx, [ebx + ebx * 2] // src_stride * 3
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea esi, [esi + 32]
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
@@ -416,12 +426,13 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
sub ecx, 8
- ja wloop
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
- popad
+ pop edi
+ pop esi
ret
}
}
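
// The cascaded pavgb above stands in for a true 4 row / 4 column average.
// A minimal sketch of that rounding trick (illustrative helpers): pavg(a, b)
// is (a + b + 1) >> 1, so nesting it stays within one of the exact rounded
// average of four values.
#include <stdint.h>
static inline uint8_t Pavg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}
static inline uint8_t Avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return Pavg(Pavg(a, b), Pavg(c, d));  // ~ (a + b + c + d + 2) >> 2
}
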
@@ -429,64 +440,66 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes
- psrlq xmm7, 56
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
+ psrlq xmm5, 56
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm7
- pand xmm1, xmm7
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
packuswb xmm0, xmm1 // 32->16
packuswb xmm0, xmm0 // 16->8
packuswb xmm0, xmm0 // 8->4
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
sub ecx, 4
- ja wloop
+ movd dword ptr [edx], xmm0
+ lea edx, [edx + 4]
+ jg wloop
- popad
ret
}
}
// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- lea edx, [ebx + ebx * 2] // src_stride * 3
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 12 + 4] // src_ptr
+ mov esi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst_ptr
+ mov ecx, [esp + 12 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
pxor xmm7, xmm7
+ align 16
wloop:
- movdqa xmm0, [esi] // average 8 rows to 1
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
+ movdqa xmm0, [eax] // average 8 rows to 1
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea ebp, [esi + ebx * 4]
- lea esi, [esi + 32]
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea ebp, [eax + esi * 4]
+ lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
@@ -494,15 +507,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
movdqa xmm2, [ebp]
movdqa xmm3, [ebp + 16]
- movdqa xmm4, [ebp + ebx]
- movdqa xmm5, [ebp + ebx + 16]
+ movdqa xmm4, [ebp + esi]
+ movdqa xmm5, [ebp + esi + 16]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
- movdqa xmm4, [ebp + ebx * 2]
- movdqa xmm5, [ebp + ebx * 2 + 16]
- movdqa xmm6, [ebp + edx]
+ movdqa xmm4, [ebp + esi * 2]
+ movdqa xmm5, [ebp + esi * 2 + 16]
+ movdqa xmm6, [ebp + edi]
pavgb xmm4, xmm6
- movdqa xmm6, [ebp + edx + 16]
+ movdqa xmm6, [ebp + edi + 16]
pavgb xmm5, xmm6
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@@ -517,60 +530,61 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
psrlw xmm0, 3
packuswb xmm0, xmm0
packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
sub ecx, 4
- ja wloop
+ movd dword ptr [edx], xmm0
+ lea edx, [edx + 4]
+ jg wloop
- popad
+ pop ebp
+ pop edi
+ pop esi
ret
}
}
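
// The 32x8 -> 4x1 blend above first averages eight rows with pavgb, then
// uses psadbw against a zeroed register as a horizontal add (the sum of
// absolute differences with 0 is just the byte sum) before psrlw 3 divides
// by 8. A minimal sketch of that psadbw trick (illustrative name):
#include <stdint.h>
static inline uint16_t HorizontalSum8(const uint8_t* p) {
  uint16_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += p[i];  // == psadbw p, 0 over one 8 byte lane
  }
  return sum;
}
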
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
+ mov eax, [esp + 4] // src_ptr
// src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm3, _shuf0
- movdqa xmm4, _shuf1
- movdqa xmm5, _shuf2
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
+ align 16
wloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + 16]
- lea esi, [esi + 32]
- movdqa xmm1, xmm2
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
pshufb xmm0, xmm3
pshufb xmm1, xmm4
pshufb xmm2, xmm5
- movq qword ptr [edi], xmm0
- movq qword ptr [edi + 8], xmm1
- movq qword ptr [edi + 16], xmm2
- lea edi, [edi + 24]
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
sub ecx, 24
- ja wloop
+ jg wloop
- popad
ret
}
}
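
// A minimal scalar sketch of the 4 -> 3 point sampling done by the
// kShuf0/kShuf1/kShuf2 shuffles above (illustrative name): of every four
// source pixels, pixels 0, 1 and 3 are kept.
#include <stdint.h>
static void RowDown34Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];
    dst += 3;
    src += 4;
  }
}
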
// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
@@ -581,86 +595,90 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
-// xmm7 round34
+// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
pavgb xmm0, xmm1
pshufb xmm0, xmm3
pmaddubsw xmm0, xmm6
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, _madd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
sub ecx, 24
- ja wloop
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ jg wloop
- popad
+ pop esi
ret
}
}
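
// A minimal scalar sketch of the filtered 4 -> 3 column pass above
// (illustrative name): after the two source rows are blended with pavgb,
// kMadd01/kMadd11/kMadd21 weight neighbouring columns 3:1, 2:2 and 1:3 with
// a rounding bias of 2 and a shift by 2.
#include <stdint.h>
static void RowDown34FilterCols_C(const uint8_t* blended, uint8_t* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = (uint8_t)((blended[0] * 3 + blended[1] + 2) >> 2);
    dst[1] = (uint8_t)((blended[1] * 2 + blended[2] * 2 + 2) >> 2);
    dst[2] = (uint8_t)((blended[2] + blended[3] * 3 + 2) >> 2);
    dst += 3;
    blended += 4;
  }
}
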
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -668,9 +686,9 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm3
@@ -678,24 +696,24 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, _madd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
sub ecx, 24
- ja wloop
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ jg wloop
- popad
+ pop esi
ret
}
}
@@ -704,202 +722,219 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler
// Scale 32 pixels to 12
-__declspec(naked)
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm5, _shuf38a
- movdqa xmm6, _shuf38b
- pxor xmm7, xmm7
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
+ align 16
xloop:
- movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
- lea esi, [esi + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm6
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
paddusb xmm0, xmm1
- movq qword ptr [edi], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edi + 8], xmm1
- lea edi, [edi + 12]
sub ecx, 12
- ja xloop
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ jg xloop
- popad
ret
}
}
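
// A minimal scalar sketch of the 8 -> 3 point sampling above (illustrative
// name): kShuf38a/kShuf38b keep source pixels 0, 3 and 6 of every eight.
#include <stdint.h>
static void RowDown38Point_C(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[3];
    dst[2] = src[6];
    dst += 3;
    src += 8;
  }
}
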
// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufac0
- movdqa xmm5, _shufac3
- movdqa xmm6, _scaleac3
- pxor xmm7, xmm7
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5
+ align 16
xloop:
- movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
- movdqa xmm2, [esi + edx]
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqa xmm6, [eax + esi]
movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- movdqa xmm2, [esi + edx * 2]
- lea esi, [esi + 16]
- movhlps xmm3, xmm2
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
-
- movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqa xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
psrldq xmm0, 2
- paddusw xmm2, xmm0
+ paddusw xmm6, xmm0
psrldq xmm0, 2
- paddusw xmm2, xmm0
- pshufb xmm2, xmm4
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
- movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
psrldq xmm1, 2
- paddusw xmm3, xmm1
+ paddusw xmm7, xmm1
psrldq xmm1, 2
- paddusw xmm3, xmm1
- pshufb xmm3, xmm5
- paddusw xmm2, xmm3
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
- pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
- packuswb xmm2, xmm2
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
- movd [edi], xmm2 // write 6 pixels
- pextrw eax, xmm2, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
sub ecx, 6
- ja xloop
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ jg xloop
- popad
+ pop esi
ret
}
}
// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufab0
- movdqa xmm5, _shufab1
- movdqa xmm6, _shufab2
- movdqa xmm7, _scaleab2
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+ align 16
xloop:
- movdqa xmm2, [esi] // average 2 rows into xmm2
- pavgb xmm2, [esi + edx]
- lea esi, [esi + 16]
-
- movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
+ movdqa xmm0, [eax] // average 2 rows into xmm0
+ pavgb xmm0, [eax + esi]
+ lea eax, [eax + 16]
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
pshufb xmm0, xmm4
- movdqa xmm1, xmm2
- pshufb xmm1, xmm5
- paddusw xmm0, xmm1
- pshufb xmm2, xmm6
- paddusw xmm0, xmm2
+ paddusw xmm1, xmm0
- pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
- packuswb xmm0, xmm0
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
- movd [edi], xmm0 // write 6 pixels
- pextrw eax, xmm0, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
sub ecx, 6
- ja xloop
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ jg xloop
- popad
+ pop esi
ret
}
}
#define HAS_SCALEADDROWS_SSE2
-// Reads 8xN bytes and produces 16 shorts at a time.
-__declspec(naked)
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+// Reads 16xN bytes and produces 16 shorts at a time.
+__declspec(naked) __declspec(align(16))
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
__asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- mov ebx, [esp + 32 + 20] // height
- pxor xmm7, xmm7
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov esi, [esp + 16 + 4] // src_ptr
+ mov edx, [esp + 16 + 8] // src_stride
+ mov edi, [esp + 16 + 12] // dst_ptr
+ mov ecx, [esp + 16 + 16] // dst_width
+ mov ebx, [esp + 16 + 20] // height
+ pxor xmm4, xmm4
dec ebx
+ align 16
xloop:
// first row
- movdqa xmm2, [esi]
+ movdqa xmm0, [esi]
lea eax, [esi + edx]
- movhlps xmm3, xmm2
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ lea esi, [esi + 16]
mov ebp, ebx
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
+ test ebp, ebp
+ je ydone
// sum remaining rows
+ align 16
yloop:
- movdqa xmm0, [eax] // read 16 pixels
+ movdqa xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
- movhlps xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- paddusw xmm2, xmm0 // sum 16 words
- paddusw xmm3, xmm1
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
sub ebp, 1
- ja yloop
-
- movdqa [edi], xmm2
- movdqa [edi + 16], xmm3
+ jg yloop
+ ydone:
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
lea edi, [edi + 32]
- lea esi, [esi + 16]
sub ecx, 16
- ja xloop
+ jg xloop
- popad
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
ret
}
}
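
// A minimal scalar sketch of ScaleAddRows above (illustrative name): each
// output word is the column sum over src_height rows, presumably divided by
// the box area later in the box-filter path. The SSE2 code saturates with
// paddusw; this sketch simply accumulates.
#include <stdint.h>
static void AddRows_C(const uint8_t* src, int src_stride, uint16_t* dst,
                      int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src[x + y * src_stride];
    }
    dst[x] = sum;
  }
}
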
+#ifndef SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-#define HAS_SCALEFILTERROWS_SSE2
-__declspec(naked)
+// Normal formula for bilinear interpolation is:
+// source_y_fraction * row1 + (1 - source_y_fraction) * row0
+// SSE2 version using a single multiply of the difference:
+// source_y_fraction * (row1 - row0) + row0
+#define HAS_SCALEFILTERROWS_SSE2_DISABLED
+__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
+ ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
@@ -909,88 +944,88 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
cmp eax, 0
je xloop1
cmp eax, 128
je xloop2
- movd xmm6, eax // xmm6 = y fraction
- punpcklwd xmm6, xmm6
- pshufd xmm6, xmm6, 0
- neg eax // xmm5 = 256 - y fraction
- add eax, 256
- movd xmm5, eax
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- pxor xmm7, xmm7
+ pxor xmm4, xmm4
+ align 16
xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
movdqa xmm1, xmm0
movdqa xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm2, xmm7
- punpckhbw xmm1, xmm7
- punpckhbw xmm3, xmm7
- pmullw xmm0, xmm5 // scale row 0
- pmullw xmm1, xmm5
- pmullw xmm2, xmm6 // scale row 1
- pmullw xmm3, xmm6
- paddusw xmm0, xmm2 // sum rows
- paddusw xmm1, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop1:
movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop1
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop2:
movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
+ pavgb xmm0, [esi + edx]
sub ecx, 16
- ja xloop2
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
}
}
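
// A minimal scalar sketch of the single multiply bilinear form described in
// the comment above (illustrative name): since f * row1 + (1 - f) * row0 is
// row0 + f * (row1 - row0), only the difference needs to be scaled. Rounding
// differs slightly from the pmulhw path; an arithmetic right shift is assumed.
#include <stdint.h>
static void FilterRowDiff_C(uint8_t* dst, const uint8_t* row0,
                            const uint8_t* row1, int width,
                            int source_y_fraction) {  // 0..255
  for (int x = 0; x < width; ++x) {
    int diff = row1[x] - row0[x];
    dst[x] = (uint8_t)(row0[x] + ((diff * source_y_fraction) >> 8));
  }
}
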
-
+#endif // SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
+ ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
@@ -1000,1491 +1035,996 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
cmp eax, 0
je xloop1
- cmp eax, 128
+ cmp eax, 64
je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
- shr eax, 1
- mov ah,al
- neg al
- add al, 128
- movd xmm7, eax
- punpcklwd xmm7, xmm7
- pshufd xmm7, xmm7, 0
-
+ align 16
xloop:
movdqa xmm0, [esi]
movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
- mov al, [edi - 1]
- mov [edi], al
pop edi
pop esi
ret
+ align 16
xloop1:
movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
sub ecx, 16
- ja xloop1
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
+ align 16
xloop2:
movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
+ pavgb xmm0, [esi + edx]
sub ecx, 16
- ja xloop2
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
- mov al, [edi - 1]
- mov [edi], al
+ punpckhbw xmm0, xmm0
+ pshufhw xmm0, xmm0, 0xff
+ punpckhqdq xmm0, xmm0
+ movdqa [esi + edi], xmm0
pop edi
pop esi
ret
-
}
}
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width) {
- __asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
- movdqa xmm1, _round34
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _madd21
-
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax+8] // pixels 8..15
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+8], xmm0
- movdqa xmm0, [eax+16] // pixels 16..23
- lea eax, [eax+32]
- pshufb xmm0, xmm4
- pmaddubsw xmm0, xmm7
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+16], xmm0
- lea edx, [edx+24]
- sub ecx, 24
- ja wloop
- ret
- }
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && \
- !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
}
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movdqa 0x10(%0,%3,1),%%xmm3\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrlw $0x8,%%xmm1\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "pavgw %%xmm2,%%xmm0\n"
- "pavgw %%xmm3,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%1)\n"
- "lea 0x10(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
}
#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrld $0x18,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
}
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- intptr_t temp = 0;
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlw $0x8,%%xmm7\n"
- "lea (%4,%4,2),%3\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%4,1),%%xmm2\n"
- "movdqa 0x10(%0,%4,1),%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa (%0,%4,2),%%xmm2\n"
- "movdqa 0x10(%0,%4,2),%%xmm3\n"
- "movdqa (%0,%3,1),%%xmm4\n"
- "movdqa 0x10(%0,%3,1),%%xmm5\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrlw $0x8,%%xmm1\n"
- "pand %%xmm7,%%xmm2\n"
- "pand %%xmm7,%%xmm3\n"
- "pavgw %%xmm2,%%xmm0\n"
- "pavgw %%xmm3,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrlw $0x8,%%xmm0\n"
- "pand %%xmm7,%%xmm2\n"
- "pavgw %%xmm2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
- "sub $0x8,%2\n"
- "ja 1b\n"
+ intptr_t stridex3 = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea (%4,%4,2),%3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%4,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
- "+r"(temp) // %3
+ "+r"(stridex3) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory"
-);
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+ );
}
#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n"
- "psrlq $0x38,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pand %%xmm7,%%xmm0\n"
- "pand %%xmm7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlq $0x38,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "memory"
-);
-}
-
-#if defined(__i386__)
-extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown8Int_SSE2\n"
-"_ScaleRowDown8Int_SSE2:\n"
-#else
- ".global ScaleRowDown8Int_SSE2\n"
-"ScaleRowDown8Int_SSE2:\n"
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "lea (%ebx,%ebx,2),%edx\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm1\n"
- "movdqa (%esi,%ebx,1),%xmm2\n"
- "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "movdqa (%esi,%ebx,2),%xmm2\n"
- "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
- "movdqa (%esi,%edx,1),%xmm4\n"
- "movdqa 0x10(%esi,%edx,1),%xmm5\n"
- "lea (%esi,%ebx,4),%ebp\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "movdqa 0x0(%ebp),%xmm2\n"
- "movdqa 0x10(%ebp),%xmm3\n"
- "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
- "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
- "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
- "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
- "pavgb %xmm6,%xmm4\n"
- "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
- "pavgb %xmm6,%xmm5\n"
- "pavgb %xmm4,%xmm2\n"
- "pavgb %xmm5,%xmm3\n"
- "pavgb %xmm2,%xmm0\n"
- "pavgb %xmm3,%xmm1\n"
- "psadbw %xmm7,%xmm0\n"
- "psadbw %xmm7,%xmm1\n"
- "pshufd $0xd8,%xmm0,%xmm0\n"
- "pshufd $0x8d,%xmm1,%xmm1\n"
- "por %xmm1,%xmm0\n"
- "psrlw $0x3,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movd %xmm0,(%edi)\n"
- "lea 0x4(%edi),%edi\n"
- "sub $0x4,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-// fpic is used for magiccam plugin
-#if !defined(__PIC__)
-#define HAS_SCALEROWDOWN34_SSSE3
-extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_SSSE3\n"
-"_ScaleRowDown34_SSSE3:\n"
-#else
- ".global ScaleRowDown34_SSSE3\n"
-"ScaleRowDown34_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf0,%xmm3\n"
- "movdqa _shuf1,%xmm4\n"
- "movdqa _shuf2,%xmm5\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm2\n"
- "lea 0x20(%esi),%esi\n"
- "movdqa %xmm2,%xmm1\n"
- "palignr $0x8,%xmm0,%xmm1\n"
- "pshufb %xmm3,%xmm0\n"
- "pshufb %xmm4,%xmm1\n"
- "pshufb %xmm5,%xmm2\n"
- "movq %xmm0,(%edi)\n"
- "movq %xmm1,0x8(%edi)\n"
- "movq %xmm2,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_1_Int_SSSE3\n"
-"_ScaleRowDown34_1_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown34_1_Int_SSSE3\n"
-"ScaleRowDown34_1_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebp\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf01,%xmm2\n"
- "movdqa _shuf11,%xmm3\n"
- "movdqa _shuf21,%xmm4\n"
- "movdqa _madd01,%xmm5\n"
- "movdqa _madd11,%xmm6\n"
- "movdqa _round34,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%ebp),%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm2,%xmm0\n"
- "pmaddubsw %xmm5,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movdqu 0x8(%esi),%xmm0\n"
- "movdqu 0x8(%esi,%ebp),%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm3,%xmm0\n"
- "pmaddubsw %xmm6,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x8(%edi)\n"
- "movdqa 0x10(%esi),%xmm0\n"
- "movdqa 0x10(%esi,%ebp),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa _madd21,%xmm1\n"
- "pmaddubsw %xmm1,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
-
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown34_0_Int_SSSE3\n"
-"_ScaleRowDown34_0_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown34_0_Int_SSSE3\n"
-"ScaleRowDown34_0_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%ebp\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf01,%xmm2\n"
- "movdqa _shuf11,%xmm3\n"
- "movdqa _shuf21,%xmm4\n"
- "movdqa _madd01,%xmm5\n"
- "movdqa _madd11,%xmm6\n"
- "movdqa _round34,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%ebp,1),%xmm1\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm2,%xmm0\n"
- "pmaddubsw %xmm5,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movdqu 0x8(%esi),%xmm0\n"
- "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm3,%xmm0\n"
- "pmaddubsw %xmm6,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x8(%edi)\n"
- "movdqa 0x10(%esi),%xmm0\n"
- "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pavgb %xmm0,%xmm1\n"
- "pavgb %xmm1,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa _madd21,%xmm1\n"
- "pmaddubsw %xmm1,%xmm0\n"
- "paddsw %xmm7,%xmm0\n"
- "psrlw $0x2,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movq %xmm0,0x10(%edi)\n"
- "lea 0x18(%edi),%edi\n"
- "sub $0x18,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-#define HAS_SCALEROWDOWN38_SSSE3
-extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_SSSE3\n"
-"_ScaleRowDown38_SSSE3:\n"
-#else
- ".global ScaleRowDown38_SSSE3\n"
-"ScaleRowDown38_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shuf38a ,%xmm5\n"
- "movdqa _shuf38b ,%xmm6\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa 0x10(%esi),%xmm1\n"
- "lea 0x20(%esi),%esi\n"
- "pshufb %xmm5,%xmm0\n"
- "pshufb %xmm6,%xmm1\n"
- "paddusb %xmm1,%xmm0\n"
- "movq %xmm0,(%edi)\n"
- "movhlps %xmm0,%xmm1\n"
- "movd %xmm1,0x8(%edi)\n"
- "lea 0xc(%edi),%edi\n"
- "sub $0xc,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_3_Int_SSSE3\n"
-"_ScaleRowDown38_3_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown38_3_Int_SSSE3\n"
-"ScaleRowDown38_3_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shufac0,%xmm4\n"
- "movdqa _shufac3,%xmm5\n"
- "movdqa _scaleac3,%xmm6\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "movhlps %xmm0,%xmm1\n"
- "movhlps %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm1\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "movdqa (%esi,%edx,2),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movhlps %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "movdqa %xmm0,%xmm2\n"
- "psrldq $0x2,%xmm0\n"
- "paddusw %xmm0,%xmm2\n"
- "psrldq $0x2,%xmm0\n"
- "paddusw %xmm0,%xmm2\n"
- "pshufb %xmm4,%xmm2\n"
- "movdqa %xmm1,%xmm3\n"
- "psrldq $0x2,%xmm1\n"
- "paddusw %xmm1,%xmm3\n"
- "psrldq $0x2,%xmm1\n"
- "paddusw %xmm1,%xmm3\n"
- "pshufb %xmm5,%xmm3\n"
- "paddusw %xmm3,%xmm2\n"
- "pmulhuw %xmm6,%xmm2\n"
- "packuswb %xmm2,%xmm2\n"
- "movd %xmm2,(%edi)\n"
- "pextrw $0x2,%xmm2,%eax\n"
- "mov %ax,0x4(%edi)\n"
- "lea 0x6(%edi),%edi\n"
- "sub $0x6,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleRowDown38_2_Int_SSSE3\n"
-"_ScaleRowDown38_2_Int_SSSE3:\n"
-#else
- ".global ScaleRowDown38_2_Int_SSSE3\n"
-"ScaleRowDown38_2_Int_SSSE3:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "movdqa _shufab0,%xmm4\n"
- "movdqa _shufab1,%xmm5\n"
- "movdqa _shufab2,%xmm6\n"
- "movdqa _scaleab2,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm2\n"
- "pavgb (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm2,%xmm0\n"
- "pshufb %xmm4,%xmm0\n"
- "movdqa %xmm2,%xmm1\n"
- "pshufb %xmm5,%xmm1\n"
- "paddusw %xmm1,%xmm0\n"
- "pshufb %xmm6,%xmm2\n"
- "paddusw %xmm2,%xmm0\n"
- "pmulhuw %xmm7,%xmm0\n"
- "packuswb %xmm0,%xmm0\n"
- "movd %xmm0,(%edi)\n"
- "pextrw $0x2,%xmm0,%eax\n"
- "mov %ax,0x4(%edi)\n"
- "lea 0x6(%edi),%edi\n"
- "sub $0x6,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-#endif // __PIC__
-
-#define HAS_SCALEADDROWS_SSE2
-extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleAddRows_SSE2\n"
-"_ScaleAddRows_SSE2:\n"
-#else
- ".global ScaleAddRows_SSE2\n"
-"ScaleAddRows_SSE2:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%esi\n"
- "mov 0x28(%esp),%edx\n"
- "mov 0x2c(%esp),%edi\n"
- "mov 0x30(%esp),%ecx\n"
- "mov 0x34(%esp),%ebx\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm2\n"
- "lea (%esi,%edx,1),%eax\n"
- "movhlps %xmm2,%xmm3\n"
- "lea -0x1(%ebx),%ebp\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpcklbw %xmm7,%xmm3\n"
-
-"2:"
- "movdqa (%eax),%xmm0\n"
- "lea (%eax,%edx,1),%eax\n"
- "movhlps %xmm0,%xmm1\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm1\n"
- "paddusw %xmm0,%xmm2\n"
- "paddusw %xmm1,%xmm3\n"
- "sub $0x1,%ebp\n"
- "ja 2b\n"
-
- "movdqa %xmm2,(%edi)\n"
- "movdqa %xmm3,0x10(%edi)\n"
- "lea 0x20(%edi),%edi\n"
- "lea 0x10(%esi),%esi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleFilterRows_SSE2\n"
-"_ScaleFilterRows_SSE2:\n"
-#else
- ".global ScaleFilterRows_SSE2\n"
-"ScaleFilterRows_SSE2:\n"
-#endif
- "push %esi\n"
- "push %edi\n"
- "mov 0xc(%esp),%edi\n"
- "mov 0x10(%esp),%esi\n"
- "mov 0x14(%esp),%edx\n"
- "mov 0x18(%esp),%ecx\n"
- "mov 0x1c(%esp),%eax\n"
- "cmp $0x0,%eax\n"
- "je 2f\n"
- "cmp $0x80,%eax\n"
- "je 3f\n"
- "movd %eax,%xmm6\n"
- "punpcklwd %xmm6,%xmm6\n"
- "pshufd $0x0,%xmm6,%xmm6\n"
- "neg %eax\n"
- "add $0x100,%eax\n"
- "movd %eax,%xmm5\n"
- "punpcklwd %xmm5,%xmm5\n"
- "pshufd $0x0,%xmm5,%xmm5\n"
- "pxor %xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,%xmm1\n"
- "movdqa %xmm2,%xmm3\n"
- "punpcklbw %xmm7,%xmm0\n"
- "punpcklbw %xmm7,%xmm2\n"
- "punpckhbw %xmm7,%xmm1\n"
- "punpckhbw %xmm7,%xmm3\n"
- "pmullw %xmm5,%xmm0\n"
- "pmullw %xmm5,%xmm1\n"
- "pmullw %xmm6,%xmm2\n"
- "pmullw %xmm6,%xmm3\n"
- "paddusw %xmm2,%xmm0\n"
- "paddusw %xmm3,%xmm1\n"
- "psrlw $0x8,%xmm0\n"
- "psrlw $0x8,%xmm1\n"
- "packuswb %xmm1,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"2:"
- "movdqa (%esi),%xmm0\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 2b\n"
-
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"3:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "pavgb %xmm2,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 3b\n"
-
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-);
+ );
+}
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- ".text\n"
-#if defined(OSX)
- ".globl _ScaleFilterRows_SSSE3\n"
-"_ScaleFilterRows_SSSE3:\n"
-#else
- ".global ScaleFilterRows_SSSE3\n"
-"ScaleFilterRows_SSSE3:\n"
-#endif
- "push %esi\n"
- "push %edi\n"
- "mov 0xc(%esp),%edi\n"
- "mov 0x10(%esp),%esi\n"
- "mov 0x14(%esp),%edx\n"
- "mov 0x18(%esp),%ecx\n"
- "mov 0x1c(%esp),%eax\n"
- "cmp $0x0,%eax\n"
- "je 2f\n"
- "cmp $0x80,%eax\n"
- "je 3f\n"
- "shr %eax\n"
- "mov %al,%ah\n"
- "neg %al\n"
- "add $0x80,%al\n"
- "movd %eax,%xmm7\n"
- "punpcklwd %xmm7,%xmm7\n"
- "pshufd $0x0,%xmm7,%xmm7\n"
-
-"1:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,%xmm1\n"
- "punpcklbw %xmm2,%xmm0\n"
- "punpckhbw %xmm2,%xmm1\n"
- "pmaddubsw %xmm7,%xmm0\n"
- "pmaddubsw %xmm7,%xmm1\n"
- "psrlw $0x7,%xmm0\n"
- "psrlw $0x7,%xmm1\n"
- "packuswb %xmm1,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 1b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"2:"
- "movdqa (%esi),%xmm0\n"
- "lea 0x10(%esi),%esi\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 2b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-
-"3:"
- "movdqa (%esi),%xmm0\n"
- "movdqa (%esi,%edx,1),%xmm2\n"
- "lea 0x10(%esi),%esi\n"
- "pavgb %xmm2,%xmm0\n"
- "movdqa %xmm0,(%edi)\n"
- "lea 0x10(%edi),%edi\n"
- "sub $0x10,%ecx\n"
- "ja 3b\n"
- "mov -0x1(%edi),%al\n"
- "mov %al,(%edi)\n"
- "pop %edi\n"
- "pop %esi\n"
- "ret\n"
-);
-
-#elif defined(__x86_64__)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "lea (%3,%3,2),%%r10\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movdqa 0x10(%0,%3,1),%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa (%0,%3,2),%%xmm2\n"
- "movdqa 0x10(%0,%3,2),%%xmm3\n"
- "movdqa (%0,%%r10,1),%%xmm4\n"
- "movdqa 0x10(%0,%%r10,1),%%xmm5\n"
- "lea (%0,%3,4),%%r11\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "movdqa 0x0(%%r11),%%xmm2\n"
- "movdqa 0x10(%%r11),%%xmm3\n"
- "movdqa 0x0(%%r11,%3,1),%%xmm4\n"
- "movdqa 0x10(%%r11,%3,1),%%xmm5\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "movdqa 0x0(%%r11,%3,2),%%xmm4\n"
- "movdqa 0x10(%%r11,%3,2),%%xmm5\n"
- "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n"
- "pavgb %%xmm6,%%xmm4\n"
- "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n"
- "pavgb %%xmm6,%%xmm5\n"
- "pavgb %%xmm4,%%xmm2\n"
- "pavgb %%xmm5,%%xmm3\n"
- "pavgb %%xmm2,%%xmm0\n"
- "pavgb %%xmm3,%%xmm1\n"
- "psadbw %%xmm7,%%xmm0\n"
- "psadbw %%xmm7,%%xmm1\n"
- "pshufd $0xd8,%%xmm0,%%xmm0\n"
- "pshufd $0x8d,%%xmm1,%%xmm1\n"
- "por %%xmm1,%%xmm0\n"
- "psrlw $0x3,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "lea 0x4(%1),%1\n"
- "sub $0x4,%2\n"
- "ja 1b\n"
+ intptr_t stridex3 = 0;
+ intptr_t row4 = 0;
+ asm volatile (
+ "lea (%5,%5,2),%3 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%5,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%5,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%5,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%5,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea (%0,%5,4),%4 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa 0x0(%4),%%xmm2 \n"
+ "movdqa 0x10(%4),%%xmm3 \n"
+ "movdqa 0x0(%4,%5,1),%%xmm4 \n"
+ "movdqa 0x10(%4,%5,1),%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "movdqa 0x0(%4,%5,2),%%xmm4 \n"
+ "movdqa 0x10(%4,%5,2),%%xmm5 \n"
+ "movdqa 0x0(%4,%3,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm4 \n"
+ "movdqa 0x10(%4,%3,1),%%xmm6 \n"
+ "pavgb %%xmm6,%%xmm5 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psadbw %%xmm7,%%xmm0 \n"
+ "psadbw %%xmm7,%%xmm1 \n"
+ "pshufd $0xd8,%%xmm0,%%xmm0 \n"
+ "pshufd $0x8d,%%xmm1,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "psrlw $0x3,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%1) \n"
+ "lea 0x4(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ "+rm"(dst_width), // %2
+ "+r"(stridex3), // %3
+ "+r"(row4) // %4
+ : "r"(static_cast<intptr_t>(src_stride)) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
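The rewritten ScaleRowDown8Int_SSE2 above approximates an 8x8 box average per output pixel: a tree of pavgb row averages, then psadbw to sum each half-register, then a shift by 3. A scalar sketch of the reduction it approximates (the helper name is hypothetical, and the SIMD path rounds at every pavgb step, so results can differ slightly):

// Straight 8x8 box average per destination pixel (illustration only).
static void ScaleRowDown8IntRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    unsigned sum = 0;
    for (int r = 0; r < 8; ++r)
      for (int c = 0; c < 8; ++c)
        sum += src_ptr[r * src_stride + c];
    dst_ptr[x] = static_cast<uint8>(sum / 64);
    src_ptr += 8;
  }
}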
#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%3),%%xmm3\n"
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm2\n"
- "lea 0x20(%0),%0\n"
- "movdqa %%xmm2,%%xmm1\n"
- "palignr $0x8,%%xmm0,%%xmm1\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pshufb %%xmm4,%%xmm1\n"
- "pshufb %%xmm5,%%xmm2\n"
- "movq %%xmm0,(%1)\n"
- "movq %%xmm1,0x8(%1)\n"
- "movq %%xmm2,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf0), // %3
- "r"(_shuf1), // %4
- "r"(_shuf2) // %5
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-);
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
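ScaleRowDown34_SSSE3 handles the unfiltered 3/4 case with three pshufb masks (kShuf0/1/2, defined earlier in the file) that keep 24 of every 32 source bytes. The masks are not reproduced in this hunk, but the intended sampling pattern, keeping bytes 0, 1 and 3 of every group of 4, can be sketched as follows (hypothetical helper name):

// Point-sample 3 of every 4 pixels: offsets 0, 1 and 3 within each group.
static void ScaleRowDown34Ref(const uint8* src_ptr, uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}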
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm2\n" // _shuf01
- "movdqa (%5),%%xmm3\n" // _shuf11
- "movdqa (%6),%%xmm4\n" // _shuf21
- "movdqa (%7),%%xmm5\n" // _madd01
- "movdqa (%8),%%xmm6\n" // _madd11
- "movdqa (%9),%%xmm7\n" // _round34
- "movdqa (%10),%%xmm8\n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3),%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm2,%%xmm0\n"
- "pmaddubsw %%xmm5,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqu 0x8(%0),%%xmm0\n"
- "movdqu 0x8(%0,%3),%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x8(%1)\n"
- "movdqa 0x10(%0),%%xmm0\n"
- "movdqa 0x10(%0,%3),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "pmaddubsw %%xmm8,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
-);
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm2\n" // _shuf01
- "movdqa (%5),%%xmm3\n" // _shuf11
- "movdqa (%6),%%xmm4\n" // _shuf21
- "movdqa (%7),%%xmm5\n" // _madd01
- "movdqa (%8),%%xmm6\n" // _madd11
- "movdqa (%9),%%xmm7\n" // _round34
- "movdqa (%10),%%xmm8\n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3,1),%%xmm1\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm2,%%xmm0\n"
- "pmaddubsw %%xmm5,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movdqu 0x8(%0),%%xmm0\n"
- "movdqu 0x8(%0,%3,1),%%xmm1\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm3,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x8(%1)\n"
- "movdqa 0x10(%0),%%xmm0\n"
- "movdqa 0x10(%0,%3,1),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pavgb %%xmm0,%%xmm1\n"
- "pavgb %%xmm1,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "pmaddubsw %%xmm8,%%xmm0\n"
- "paddsw %%xmm7,%%xmm0\n"
- "psrlw $0x2,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movq %%xmm0,0x10(%1)\n"
- "lea 0x18(%1),%1\n"
- "sub $0x18,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
-);
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%3),%%xmm5\n"
- "movdqa (%4),%%xmm6\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "pshufb %%xmm5,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "paddusb %%xmm1,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "movhlps %%xmm0,%%xmm1\n"
- "movd %%xmm1,0x8(%1)\n"
- "lea 0xc(%1),%1\n"
- "sub $0xc,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf38a), // %3
- "r"(_shuf38b) // %4
- : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
-);
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+ );
}
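ScaleRowDown38_SSSE3 point-samples 3 of every 8 source pixels: kShuf38a and kShuf38b (defined elsewhere in the file) each pull three bytes out of one 16-byte load, and paddusb merges them into 12 outputs per 32 inputs. Assuming the usual 3/8 sampling offsets of 0, 3 and 6, a scalar sketch looks like this (hypothetical helper name):

// Point-sample 3 of every 8 pixels: offsets 0, 3 and 6 within each group.
static void ScaleRowDown38Ref(const uint8* src_ptr, uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[3];
    dst[2] = src_ptr[6];
    dst += 3;
    src_ptr += 8;
  }
}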
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
- "movdqa (%6),%%xmm6\n"
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa (%0,%3,1),%%xmm2\n"
- "movhlps %%xmm0,%%xmm1\n"
- "movhlps %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm1\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "movdqa (%0,%3,2),%%xmm2\n"
- "lea 0x10(%0),%0\n"
- "movhlps %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "movdqa %%xmm0,%%xmm2\n"
- "psrldq $0x2,%%xmm0\n"
- "paddusw %%xmm0,%%xmm2\n"
- "psrldq $0x2,%%xmm0\n"
- "paddusw %%xmm0,%%xmm2\n"
- "pshufb %%xmm4,%%xmm2\n"
- "movdqa %%xmm1,%%xmm3\n"
- "psrldq $0x2,%%xmm1\n"
- "paddusw %%xmm1,%%xmm3\n"
- "psrldq $0x2,%%xmm1\n"
- "paddusw %%xmm1,%%xmm3\n"
- "pshufb %%xmm5,%%xmm3\n"
- "paddusw %%xmm3,%%xmm2\n"
- "pmulhuw %%xmm6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movd %%xmm2,(%1)\n"
- "pextrw $0x2,%%xmm2,%%eax\n"
- "mov %%ax,0x4(%1)\n"
- "lea 0x6(%1),%1\n"
- "sub $0x6,%2\n"
- "ja 1b\n"
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3,1),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shufac0), // %4
- "r"(_shufac3), // %5
- "r"(_scaleac3) // %6
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
}
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile(
- "movdqa (%4),%%xmm4\n"
- "movdqa (%5),%%xmm5\n"
- "movdqa (%6),%%xmm6\n"
- "movdqa (%7),%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm2\n"
- "pavgb (%0,%3,1),%%xmm2\n"
- "lea 0x10(%0),%0\n"
- "movdqa %%xmm2,%%xmm0\n"
- "pshufb %%xmm4,%%xmm0\n"
- "movdqa %%xmm2,%%xmm1\n"
- "pshufb %%xmm5,%%xmm1\n"
- "paddusw %%xmm1,%%xmm0\n"
- "pshufb %%xmm6,%%xmm2\n"
- "paddusw %%xmm2,%%xmm0\n"
- "pmulhuw %%xmm7,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "movd %%xmm0,(%1)\n"
- "pextrw $0x2,%%xmm0,%%eax\n"
- "mov %%ax,0x4(%1)\n"
- "lea 0x6(%1),%1\n"
- "sub $0x6,%2\n"
- "ja 1b\n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "r"(_shufab0), // %4
- "r"(_shufab1), // %5
- "r"(_shufab2), // %6
- "r"(_scaleab2) // %7
- : "memory", "rax", "xmm0", "xmm1", "xmm2",
- "xmm4", "xmm5", "xmm6", "xmm7"
-);
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa (%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
}
#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- asm volatile(
- "pxor %%xmm7,%%xmm7\n"
-"1:"
- "movdqa (%0),%%xmm2\n"
- "lea (%0,%4,1),%%r10\n"
- "movhlps %%xmm2,%%xmm3\n"
- "lea -0x1(%3),%%r11\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpcklbw %%xmm7,%%xmm3\n"
-
-"2:"
- "movdqa (%%r10),%%xmm0\n"
- "lea (%%r10,%4,1),%%r10\n"
- "movhlps %%xmm0,%%xmm1\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm1\n"
- "paddusw %%xmm0,%%xmm2\n"
- "paddusw %%xmm1,%%xmm3\n"
- "sub $0x1,%%r11\n"
- "ja 2b\n"
-
- "movdqa %%xmm2,(%1)\n"
- "movdqa %%xmm3,0x10(%1)\n"
- "lea 0x20(%1),%1\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
+ asm volatile (
+ "pxor %%xmm4,%%xmm4 \n"
+ "sub $0x1,%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "mov %0,%3 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "mov %5,%2 \n"
+ "test %2,%2 \n"
+ "je 3f \n"
+ "2: \n"
+ "movdqa (%0),%%xmm2 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 2b \n"
+ "3: \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x10(%3),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(src_width), // %2
- "+r"(src_height) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
-);
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"(static_cast<intptr_t>(src_stride)) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
}
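ScaleAddRows_SSE2 now walks the source 16 columns at a time, widening each byte to 16 bits and accumulating src_height rows per column into dst_ptr; the box scaler divides these column sums later. A scalar sketch of the same accumulation (saturation from paddusw is ignored, and the helper name is hypothetical):

static void ScaleAddRowsRef(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint16* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];               // sum one column over src_height rows
      s += src_stride;
    }
    dst_ptr[x] = static_cast<uint16>(sum);
  }
}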
+#ifndef SSE2_DISABLED
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
+#define HAS_SCALEFILTERROWS_SSE2_DISABLED
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
- if (source_y_fraction == 0) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "rax", "xmm0"
- );
- return;
- } else if (source_y_fraction == 128) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%3,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "pavgb %%xmm2,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
- );
- return;
- } else {
- asm volatile(
- "mov %3,%%eax\n"
- "movd %%eax,%%xmm6\n"
- "punpcklwd %%xmm6,%%xmm6\n"
- "pshufd $0x0,%%xmm6,%%xmm6\n"
- "neg %%eax\n"
- "add $0x100,%%eax\n"
- "movd %%eax,%%xmm5\n"
- "punpcklwd %%xmm5,%%xmm5\n"
- "pshufd $0x0,%%xmm5,%%xmm5\n"
- "pxor %%xmm7,%%xmm7\n"
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%4,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,%%xmm1\n"
- "movdqa %%xmm2,%%xmm3\n"
- "punpcklbw %%xmm7,%%xmm0\n"
- "punpcklbw %%xmm7,%%xmm2\n"
- "punpckhbw %%xmm7,%%xmm1\n"
- "punpckhbw %%xmm7,%%xmm3\n"
- "pmullw %%xmm5,%%xmm0\n"
- "pmullw %%xmm5,%%xmm1\n"
- "pmullw %%xmm6,%%xmm2\n"
- "pmullw %%xmm6,%%xmm3\n"
- "paddusw %%xmm2,%%xmm0\n"
- "paddusw %%xmm3,%%xmm1\n"
- "psrlw $0x8,%%xmm0\n"
- "psrlw $0x8,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
- "xmm5", "xmm6", "xmm7"
- );
- }
- return;
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x80,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm0 \n"
+ "punpckhqdq %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
}
+#endif // SSE2_DISABLED
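The SSE2 filter above (kept behind SSE2_DISABLED and no longer selected, since its HAS_ macro is renamed to HAS_SCALEFILTERROWS_SSE2_DISABLED) now blends the two rows as a + (b - a) * f rather than (256 - f) * a + f * b, using psubw/pmulhw/paddw with fast paths for f == 0 (copy) and f == 128 (pavgb); the epilogue replicates the last pixel, which the column filter reads one past the end of the row. Per pixel that is approximately the following (sketch; pmulhw scales the replicated fraction slightly differently):

// f is source_y_fraction in [0, 256); a is from the upper row, b from the
// row src_stride below. Approximate scalar form of the SSE2 blend.
static inline uint8 FilterRowPixelSSE2(uint8 a, uint8 b, int f) {
  return static_cast<uint8>(a + (((b - a) * f) >> 8));
}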
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
- if (source_y_fraction == 0) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "rax", "xmm0"
- );
- return;
- } else if (source_y_fraction == 128) {
- asm volatile(
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%3,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "pavgb %%xmm2,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "rax", "xmm0", "xmm2"
- );
- return;
- } else {
- asm volatile(
- "mov %3,%%eax\n"
- "shr %%eax\n"
- "mov %%al,%%ah\n"
- "neg %%al\n"
- "add $0x80,%%al\n"
- "movd %%eax,%%xmm7\n"
- "punpcklwd %%xmm7,%%xmm7\n"
- "pshufd $0x0,%%xmm7,%%xmm7\n"
- "1:"
- "movdqa (%1),%%xmm0\n"
- "movdqa (%1,%4,1),%%xmm2\n"
- "lea 0x10(%1),%1\n"
- "movdqa %%xmm0,%%xmm1\n"
- "punpcklbw %%xmm2,%%xmm0\n"
- "punpckhbw %%xmm2,%%xmm1\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
- "psrlw $0x7,%%xmm0\n"
- "psrlw $0x7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "movdqa %%xmm0,(%0)\n"
- "lea 0x10(%0),%0\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- "mov -0x1(%0),%%al\n"
- "mov %%al,(%0)\n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
- );
- }
- return;
-}
-#endif
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm0 \n"
+ "punpckhqdq %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
+ );
+}
+#endif // defined(__x86_64__) || defined(__i386__)
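The SSSE3 version reaches the same result with a single pmaddubsw: the fraction is halved, the byte pair (128 - f/2, f/2) is broadcast into xmm5, the two rows are interleaved with punpcklbw/punpckhbw, and each 16-bit lane of the multiply-add is shifted right by 7. A scalar rendering of that weighting (sketch only; per-pixel rounding can differ from the C path):

// w1 = f / 2 and w0 = 128 - w1 both fit in a byte, as pmaddubsw requires.
static inline uint8 FilterRowPixelSSSE3(uint8 a, uint8 b, int f) {
  int w1 = f >> 1;
  int w0 = 128 - w1;
  return static_cast<uint8>((a * w0 + b * w1) >> 7);
}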
// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, int,
+static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 2;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[2];
+ dst += 2;
+ src_ptr += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
-static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] +
- src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
- src_ptr += 2;
+void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
-static void ScaleRowDown4_C(const uint8* src_ptr, int,
+static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 4;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[4];
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
-static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
- src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
- src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
- src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
- 8) >> 4;
- src_ptr += 4;
+ intptr_t stride = src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
}
}
@@ -2493,19 +2033,25 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
static const int kMaxOutputWidth = 640;
static const int kMaxRow12 = kMaxOutputWidth * 2;
-static void ScaleRowDown8_C(const uint8* src_ptr, int,
+static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 8;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[8];
+ dst += 2;
+ src_ptr += 16;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[0];
}
}
// Note calling code checks width is less than max and if not
// uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
- ALIGN16(uint8 src_row[kMaxRow12 * 2]);
+ SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
assert(dst_width <= kMaxOutputWidth);
ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
@@ -2514,7 +2060,7 @@ static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
}
-static void ScaleRowDown34_C(const uint8* src_ptr, int,
+static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = dst + dst_width;
@@ -2528,12 +2074,12 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
}
// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2551,12 +2097,12 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
}
// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2573,13 +2119,42 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
} while (d < dend);
}
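The two C kernels above differ only in how the pair of source rows is mixed before the 3/4 horizontal filter: ScaleRowDown34_0_Int_C weights the nearer row 3:1, while ScaleRowDown34_1_Int_C averages the rows 1:1. A sketch of the per-pixel row mix (illustrative helpers; the a0/a1/a2 column weights are as shown above):

static inline uint8 Mix31(uint8 near_px, uint8 far_px) {   // rows 0 and 1, 3:1
  return static_cast<uint8>((near_px * 3 + far_px + 2) >> 2);
}
static inline uint8 Mix11(uint8 a, uint8 b) {              // rows 1 and 2, 1:1
  return static_cast<uint8>((a + b + 1) >> 1);
}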
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
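The BLENDER macro above relies on the identity (1 - f) * a + f * b == a + f * (b - a), which saves a multiply per sample in the 16.16 fixed-point column filter. A quick numeric check of the two forms (illustration only):

// a = 10, b = 250, f = 0.5 in 16.16 fixed point: both forms give 130.
int a = 10, b = 250, f = 0x8000;
int direct  = static_cast<int>(((65536LL - f) * a + 1LL * f * b) >> 16);
int blended = a + ((f * (b - a)) >> 16);
// direct == 130 && blended == 130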
+
+static const int kMaxInputWidth = 2560;
+
#if defined(HAS_SCALEFILTERROWS_SSE2)
// Filter row to 3/4
static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = dst_ptr + dst_width;
const uint8* s = src_ptr;
+ uint8* dend = dst_ptr + dst_width;
do {
dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2588,45 +2163,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
s += 4;
} while (dst_ptr < dend);
}
-#endif
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx) {
- int x = 0;
- for (int j = 0; j < dst_width; ++j) {
- int xi = x >> 16;
- int xf1 = x & 0xffff;
- int xf0 = 65536 - xf1;
-
- *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
- x += dx;
- }
-}
-static const int kMaxInputWidth = 2560;
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-#define HAS_SCALEROWDOWN34_SSE2
+#define HAS_SCALEROWDOWN34_SSE2_DISABLED
// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- ALIGN16(uint8 row[kMaxInputWidth]);
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
- 256 / 4);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
+ ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
ScaleFilterCols34_C(dst_ptr, row, dst_width);
}
// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- ALIGN16(uint8 row[kMaxInputWidth]);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
ScaleFilterCols34_C(dst_ptr, row, dst_width);
}
#endif
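The SSE2 3/4 wrappers above (now compiled out of the dispatch via HAS_SCALEROWDOWN34_SSE2_DISABLED) reuse ScaleFilterRows_SSE2 as the row mixer: a y fraction of 256/4 = 64 approximately reproduces the 3:1 row blend, and 256/2 = 128 hits the pavgb path, the 1:1 blend. Ignoring the pmulhw scaling detail:

// f = 64:  a + ((b - a) * 64  >> 8) == (3 * a + b) >> 2   (3:1 blend)
// f = 128: a + ((b - a) * 128 >> 8) == (a + b) >> 1       (1:1 blend)
// e.g. a = 100, b = 20: 100 + ((20 - 100) * 64 >> 8) == 80 == (3 * 100 + 20) / 4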
-static void ScaleRowDown38_C(const uint8* src_ptr, int,
+static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
assert(dst_width % 3 == 0);
for (int x = 0; x < dst_width; x += 3) {
@@ -2639,23 +2199,25 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
}
// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- for (int i = 0; i < dst_width; i+=3) {
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
- src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
- src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
@@ -2663,18 +2225,19 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
}
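The 8x3 -> 3x1 kernel above divides its 9- and 6-pixel box sums with fixed-point reciprocals, sum * (65536 / 9) >> 16 and sum * (65536 / 6) >> 16, instead of integer division. The reciprocal truncates, so a result can come out one below an exact divide; for byte-sized sums it stays in range. For example:

// 65536 / 9 == 7281, so sum * 7281 >> 16 approximates sum / 9.
static inline uint8 Div9Approx(unsigned sum) {
  return static_cast<uint8>(sum * (65536 / 9) >> 16);
}
// Div9Approx(9 * 255) == 254, one below the exact 255, because the
// truncated reciprocal slightly undershoots 1/9.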
// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
- for (int i = 0; i < dst_width; i+=3) {
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
@@ -2683,7 +2246,7 @@ static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
// C version 8x2 -> 8x1
static void ScaleFilterRows_C(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
+ const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
assert(dst_width > 0);
int y1_fraction = source_y_fraction;
@@ -2706,7 +2269,7 @@ static void ScaleFilterRows_C(uint8* dst_ptr,
dst_ptr[0] = dst_ptr[-1];
}
-void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
assert(src_width > 0);
assert(src_height > 0);
@@ -2728,35 +2291,31 @@ void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
* its original size.
*
*/
-static void ScalePlaneDown2(int src_width, int src_height,
+static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 2 == 0);
- assert(src_height % 2 == 0);
- void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
-
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 16 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(dst_ptr, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
- } else
-#endif
- {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
}
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
+ ScaleRowDown2_Unaligned_SSE2;
+ if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+ }
+ }
+#endif
+ // TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 1);
@@ -2770,34 +2329,26 @@ static void ScalePlaneDown2(int src_width, int src_height,
* This is an optimized version for scaling down a plane to 1/4 of
* its original size.
*/
-static void ScalePlaneDown4(int src_width, int src_height,
+static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 4 == 0);
- assert(src_height % 4 == 0);
- void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
-
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
#if defined(HAS_SCALEROWDOWN4_NEON)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
- (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 8)) {
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(dst_width, 4)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ }
+#elif defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
- } else
-#endif
- {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
}
+#endif
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
@@ -2813,27 +2364,23 @@ static void ScalePlaneDown4(int src_width, int src_height,
* of its original size.
*
*/
-static void ScalePlaneDown8(int src_width, int src_height,
+static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
- assert(src_width % 8 == 0);
- assert(src_height % 8 == 0);
- void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering && (dst_width <= kMaxOutputWidth) ?
+ ScaleRowDown8Int_C : ScaleRowDown8_C;
#if defined(HAS_SCALEROWDOWN8_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
- (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
- } else
-#endif
- {
- ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
- ScaleRowDown8Int_C : ScaleRowDown8_C;
}
+#endif
+
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += (src_stride << 3);
@@ -2847,72 +2394,75 @@ static void ScalePlaneDown8(int src_width, int src_height,
* Provided by Frank Barchard (fbarchard@google.com)
*
*/
-static void ScalePlaneDown34(int src_width, int src_height,
+static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(dst_width % 3 == 0);
- void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
}
- } else
+ }
#endif
#if defined(HAS_SCALEROWDOWN34_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
- filtering) {
+ if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
- } else
+ }
#endif
- {
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_C;
- ScaleRowDown34_1 = ScaleRowDown34_C;
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
}
}
- int src_row = 0;
- for (int y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 1:
- ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 2:
- ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
- dst_ptr, dst_width);
- break;
- }
- ++src_row;
+#endif
+
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- if (src_row >= 3) {
- src_ptr += src_stride;
- src_row = 0;
- }
+ ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+  // Remaining 1 or 2 rows; the last row is vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
}
}
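The rewritten ScalePlaneDown34 loop consumes source rows in groups of four and writes three destination rows per group; the third call walks backwards via -src_stride so rows 3 and 2 are blended by the 3:1 kernel in the right order, and the trailer covers dst_height values not divisible by 3, passing a stride of 0 so the final row is vertically unfiltered. As a sketch of one group (assuming the _0 kernel weights its first row 3:1, as in the C path):

// Per group of 4 source rows s0..s3 -> 3 destination rows:
//   d0 = Filter31(s0, s1)   // 3:1 toward s0
//   d1 = Filter11(s1, s2)   // 1:1
//   d2 = Filter31(s3, s2)   // 3:1 toward s3, via the -src_stride call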
@@ -2922,23 +2472,47 @@ static void ScalePlaneDown34(int src_width, int src_height,
* This is an optimized version for scaling down a plane to 3/8
* of its original size.
*
- * Reduces 16x3 to 6x1
+ * Uses a box filter arranged like this:
+ * aaabbbcc -> abc
+ * aaabbbcc def
+ * aaabbbcc ghi
+ * dddeeeff
+ * dddeeeff
+ * dddeeeff
+ * ggghhhii
+ * ggghhhii
+ * Boxes are 3x3, 2x3, 3x2 and 2x2
*/
-static void ScalePlaneDown38(int src_width, int src_height,
+static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr,
FilterMode filtering) {
assert(dst_width % 3 == 0);
- void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-#if defined(HAS_SCALEROWDOWN38_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
- (dst_stride % 8 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -2946,39 +2520,34 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
}
- } else
+ }
#endif
- {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_C;
- ScaleRowDown38_2 = ScaleRowDown38_C;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
- }
+
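+  // Each iteration below consumes 3 + 3 + 2 = 8 source rows and produces 3
+  // output rows (the 3/8 vertical ratio); the row functions reduce 8 source
+  // pixels to 3 for the matching horizontal ratio.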
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
}
- int src_row = 0;
- for (int y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- case 1:
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- ++src_row;
- break;
-
- case 2:
- ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- src_row = 0;
- break;
- }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
}
}
-inline static uint32 SumBox(int iboxwidth, int iboxheight,
- int src_stride, const uint8* src_ptr) {
+static __inline uint32 SumBox(int iboxwidth, int iboxheight,
+ ptrdiff_t src_stride, const uint8* src_ptr) {
assert(iboxwidth > 0);
assert(iboxheight > 0);
uint32 sum = 0u;
@@ -2991,10 +2560,9 @@ inline static uint32 SumBox(int iboxwidth, int iboxheight,
return sum;
}
-static void ScalePlaneBoxRow(int dst_width, int boxheight,
- int dx, int src_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int x = 0;
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+ int x, int dx, ptrdiff_t src_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
@@ -3004,7 +2572,7 @@ static void ScalePlaneBoxRow(int dst_width, int boxheight,
}
}
-inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
assert(iboxwidth > 0);
uint32 sum = 0u;
for (int x = 0; x < iboxwidth; ++x) {
@@ -3013,14 +2581,13 @@ inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
return sum;
}
-static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int scaletbl[2];
int minboxwidth = (dx >> 16);
scaletbl[0] = 65536 / (minboxwidth * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
int *scaleptr = scaletbl - minboxwidth;
- int x = 0;
for (int i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
@@ -3029,11 +2596,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
}
}
-static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
- int x = 0;
for (int i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth;
@@ -3055,61 +2621,56 @@ static void ScalePlaneBox(int src_width, int src_height,
const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0);
assert(dst_height > 0);
- int dy = (src_height << 16) / dst_height;
int dx = (src_width << 16) / dst_width;
- if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height << 16);
+ if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
dst_height * 2 > src_height) {
uint8* dst = dst_ptr;
- int dy = (src_height << 16) / dst_height;
- int dx = (src_width << 16) / dst_width;
- int y = 0;
for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
+ const uint8* src = src_ptr + iy * src_stride;
y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
+ if (y > maxy) {
+ y = maxy;
}
int boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow(dst_width, boxheight,
- dx, src_stride,
- src, dst);
-
+ ScalePlaneBoxRow_C(dst_width, boxheight,
+ x, dx, src_stride,
+ src, dst);
dst += dst_stride;
}
} else {
- ALIGN16(uint16 row[kMaxInputWidth]);
- void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width, int src_height);
- void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+ SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
+ void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height)=
+ ScaleAddRows_C;
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr);
-#if defined(HAS_SCALEADDROWS_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
- ScaleAddRows = ScaleAddRows_SSE2;
- } else
-#endif
- {
- ScaleAddRows = ScaleAddRows_C;
- }
if (dx & 0xffff) {
ScaleAddCols = ScaleAddCols2_C;
} else {
ScaleAddCols = ScaleAddCols1_C;
}
+#if defined(HAS_SCALEADDROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleAddRows = ScaleAddRows_SSE2;
+ }
+#endif
- int y = 0;
for (int j = 0; j < dst_height; ++j) {
int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
+ const uint8* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
}
int boxheight = (y >> 16) - iy;
ScaleAddRows(src, src_stride, row, src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+ ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
dst_ptr += dst_stride;
}
}
@@ -3122,33 +2683,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width;
int dy = (src_height << 16) / dst_height;
- int maxx = ((src_width - 1) << 16) - 1;
- int maxy = ((src_height - 1) << 16) - 1;
- int y = (dst_height < src_height) ? 32768 :
- (src_height << 16) / dst_height - 32768;
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
for (int i = 0; i < dst_height; ++i) {
- int cy = (y < 0) ? 0 : y;
- int yi = cy >> 16;
- int yf = cy & 0xffff;
- const uint8* const src = src_ptr + yi * src_stride;
- int x = (dst_width < src_width) ? 32768 :
- (src_width << 16) / dst_width - 32768;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int yi = y >> 16;
+ int yf = y & 0xffff;
+ const uint8* src0 = src_ptr + yi * src_stride;
+ const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
+ uint8* dst = dst_ptr;
for (int j = 0; j < dst_width; ++j) {
- int cx = (x < 0) ? 0 : x;
- int xi = cx >> 16;
- int xf = cx & 0xffff;
- int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
- int r1 = (src[xi + src_stride] * (65536 - xf) +
- src[xi + src_stride + 1] * xf) >> 16;
- *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+ int xi = x >> 16;
+ int xf = x & 0xffff;
+ int x1 = (xi < src_width - 1) ? xi + 1 : xi;
+ int a = src0[xi];
+ int b = src0[x1];
+ int r0 = BLENDER(a, b, xf);
+ a = src1[xi];
+ b = src1[x1];
+ int r1 = BLENDER(a, b, xf);
+ *dst++ = BLENDER(r0, r1, yf);
x += dx;
if (x > maxx)
x = maxx;
}
- dst += dst_stride - dst_width;
+ dst_ptr += dst_stride;
y += dy;
if (y > maxy)
y = maxy;
@@ -3159,52 +2721,51 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
* Scale plane to/from any dimensions, with bilinear
* interpolation.
*/
-static void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0);
assert(dst_height > 0);
- int dy = (src_height << 16) / dst_height;
- int dx = (src_width << 16) / dst_width;
- if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
+ if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src_ptr, dst_ptr);
} else {
- ALIGN16(uint8 row[kMaxInputWidth + 1]);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride,
- int dst_width, int source_y_fraction);
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx);
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
- ScaleFilterRows = ScaleFilterRows_SSSE3;
- } else
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) =
+ ScaleFilterRows_C;
+#if defined(HAS_SCALEFILTERROWS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleFilterRows = ScaleFilterRows_NEON;
+ }
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
- (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
- (src_width % 16) == 0) {
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
ScaleFilterRows = ScaleFilterRows_SSE2;
- } else
+ }
#endif
- {
- ScaleFilterRows = ScaleFilterRows_C;
+#if defined(HAS_SCALEFILTERROWS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleFilterRows = ScaleFilterRows_SSSE3;
}
- ScaleFilterCols = ScaleFilterCols_C;
+#endif
- int y = 0;
- int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
for (int j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- int fy = (y >> 8) & 255;
- const uint8* const src = src_ptr + iy * src_stride;
- ScaleFilterRows(row, src, src_stride, src_width, fy);
- ScaleFilterCols(dst_ptr, row, dst_width, dx);
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_ptr + yi * src_stride;
+ ScaleFilterRows(row, src, src_stride, src_width, yf);
+ ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
if (y > maxy) {
@@ -3224,18 +2785,20 @@ static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- uint8* dst = dst_ptr;
int dx = (src_width << 16) / dst_width;
- for (int y = 0; y < dst_height; ++y) {
- const uint8* const src = src_ptr + (y * src_height / dst_height) *
- src_stride;
- // TODO(fbarchard): Round X coordinate by setting x=0x8000.
- int x = 0;
+ int dy = (src_height << 16) / dst_height;
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ for (int j = 0; j < dst_height; ++j) {
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ uint8* dst = dst_ptr;
for (int i = 0; i < dst_width; ++i) {
*dst++ = src[x >> 16];
x += dx;
}
- dst += dst_stride - dst_width;
+ dst_ptr += dst_stride;
+ y += dy;
}
}
@@ -3283,47 +2846,31 @@ static void ScalePlaneDown(int src_width, int src_height,
}
}
-/**
- * Copy plane, no scaling
- *
- * This simply copies the given plane without scaling.
- * The current implementation is ~115 times faster
- * compared to the reference implementation.
- *
- */
-static void CopyPlane(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- if (src_stride == src_width && dst_stride == dst_width) {
- // All contiguous, so can use REALLY fast path.
- memcpy(dst_ptr, src_ptr, src_width * src_height);
- } else {
- // Not all contiguous; must copy scanlines individually
- const uint8* src = src_ptr;
- uint8* dst = dst_ptr;
- for (int i = 0; i < src_height; ++i) {
- memcpy(dst, src, src_width);
- dst += dst_stride;
- src += src_stride;
- }
+// Scale a plane.
+// This function in turn calls a scaling function suitable for handling
+// the desired resolutions.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+#ifdef CPU_X86
+ // environment variable overrides for testing.
+ char *filter_override = getenv("LIBYUV_FILTER");
+ if (filter_override) {
+ filtering = (FilterMode)atoi(filter_override); // NOLINT
}
-}
-
-static void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- FilterMode filtering, bool use_ref) {
+#endif
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
- CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
} else if (dst_width <= src_width && dst_height <= src_height) {
// Scale down.
- if (use_ref) {
+ if (use_reference_impl_) {
// For testing, allow the optimized versions to be disabled.
ScalePlaneDown(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -3342,11 +2889,13 @@ static void ScalePlane(const uint8* src, int src_stride,
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+ } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+ } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
// optimized, 1/8
ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -3362,14 +2911,12 @@ static void ScalePlane(const uint8* src, int src_stride,
}
}
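+// A minimal usage sketch of ScalePlane, assuming caller-owned buffers with
+// stride == width; the 640x360 -> 320x180 size and the kFilterBox choice are
+// illustrative only:
+//   uint8 src_y[640 * 360];
+//   uint8 dst_y[320 * 180];
+//   ScalePlane(src_y, 640, 640, 360,  // src, src_stride, src_width, src_height
+//              dst_y, 320, 320, 180,  // dst, dst_stride, dst_width, dst_height
+//              kFilterBox);           // 1/2 in both axes uses ScalePlaneDown2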
-/**
- * Scale a plane.
- *
- * This function in turn calls a scaling function
- * suitable for handling the desired resolutions.
- *
- */
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
+
+#define UNDER_ALLOCATED_HACK 1
+LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -3394,23 +2941,47 @@ int I420Scale(const uint8* src_y, int src_stride_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
- dst_u, dst_stride_u, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
- dst_v, dst_stride_v, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
return 0;
}
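+// A minimal usage sketch, assuming contiguous caller-owned I420 buffers
+// (Y plane followed by U and V at quarter size) and the argument order of the
+// I420Scale declaration above; sizes and filter are illustrative:
+//   uint8 src[640 * 360 * 3 / 2];
+//   uint8 dst[320 * 180 * 3 / 2];
+//   I420Scale(src, 640,                      // src Y, stride
+//             src + 640 * 360, 320,          // src U, stride
+//             src + 640 * 360 * 5 / 4, 320,  // src V, stride
+//             640, 360,
+//             dst, 320,
+//             dst + 320 * 180, 160,
+//             dst + 320 * 180 * 5 / 4, 160,
+//             320, 180, kFilterBilinear);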
+// Deprecated API.
+LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
@@ -3433,49 +3004,77 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
+
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
- dst_u, dst_stride_u, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
- dst_v, dst_stride_v, halfdst_width, halfoheight,
- filtering, use_reference_impl_);
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
return 0;
}
-int Scale(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int ooffset,
- bool interpolate) {
+// Deprecated API.
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate) {
if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
- ooffset >= dst_height) {
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
+ dst_yoffset >= dst_height) {
return -1;
}
- ooffset = ooffset & ~1; // chroma requires offset to multiple of 2.
- int halfsrc_width = (src_width + 1) >> 1;
- int halfsrc_height = (src_height + 1) >> 1;
- int halfdst_width = (dst_width + 1) >> 1;
- int halfoheight = (dst_height + 1) >> 1;
- int aheight = dst_height - ooffset * 2; // actual output height
- const uint8* const iyptr = src;
- uint8* oyptr = dst + ooffset * dst_width;
- const uint8* const iuptr = src + src_width * src_height;
- uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
- const uint8* const ivptr = src + src_width * src_height +
- halfsrc_width * halfsrc_height;
- uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight +
- (ooffset >> 1) * halfdst_width;
- return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
- src_width, src_height, oyptr, ouptr, ovptr, dst_width,
- halfdst_width, halfdst_width, dst_width, aheight, interpolate);
-}
-
+ dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
+ int src_halfwidth = (src_width + 1) >> 1;
+ int src_halfheight = (src_height + 1) >> 1;
+ int dst_halfwidth = (dst_width + 1) >> 1;
+ int dst_halfheight = (dst_height + 1) >> 1;
+ int aheight = dst_height - dst_yoffset * 2; // actual output height
+ const uint8* src_y = src;
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset * dst_width;
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
+ src_width, src_height, dst_y, dst_u, dst_v, dst_width,
+ dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+}
+
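+// Worked example of the plane arithmetic above: letterboxing into a 640x480
+// I420 destination with dst_yoffset = 60 scales into the middle
+// aheight = 480 - 2 * 60 = 360 rows, with
+//   dst_y = dst + 60 * 640              = dst + 38400
+//   dst_u = dst + 640 * 480 + 30 * 320  = dst + 316800
+//   dst_v = dst_u + 320 * 240           = dst + 393600
+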
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
new file mode 100644
index 00000000..5d4e1ac0
--- /dev/null
+++ b/files/source/scale_argb.cc
@@ -0,0 +1,1035 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h> // For getenv()
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Bilinear SSE2 is disabled.
+#define SSE2_DISABLED 1
+
+// ARGB scaling uses bilinear or point, but not box filter.
+/**
+ * SSE2 downscalers with bilinear interpolation.
+ */
+
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+
+#define HAS_SCALEARGBROWDOWN2_SSE2
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0x88
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_ptr
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_ptr
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_ptr
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
+#ifndef SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ cmp eax, 0
+ je xloop1
+ cmp eax, 128
+ je xloop2
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+ pxor xmm4, xmm4
+
+ // f * row1 + (1 - frac) row0
+ // frac * (row1 - row0) + row0
+ align 16
+ xloop:
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // SSE2_DISABLED
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
+#define HAS_SCALEARGBFILTERROWS_SSSE3
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ cmp eax, 0
+ je xloop1
+ cmp eax, 64
+ je xloop2
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop1:
+ movdqa xmm0, [esi]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop1
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+
+ align 16
+ xloop2:
+ movdqa xmm0, [esi]
+ pavgb xmm0, [esi + edx]
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop2
+
+ shufps xmm0, xmm0, 0xff
+ movdqa [esi + edi], xmm0
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEARGBROWDOWN2_SSE2
+static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd (%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd (%0,%1,2),%%xmm2 \n"
+ "movd (%0,%4,1),%%xmm3 \n"
+ "lea (%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ intptr_t row1 = static_cast<intptr_t>(src_stride);
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ "lea (%0,%5,1),%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps (%0,%1,1),%%xmm0 \n"
+ "movq (%0,%1,2),%%xmm1 \n"
+ "movhps (%0,%4,1),%%xmm1 \n"
+ "lea (%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps (%5,%1,1),%%xmm2 \n"
+ "movq (%5,%1,2),%%xmm3 \n"
+ "movhps (%5,%4,1),%%xmm3 \n"
+ "lea (%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_ptr), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#ifndef SSE2_DISABLED
+// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
+#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x80,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ ".p2align 4 \n"
+ "4: \n"
+ "shufps $0xff,%%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // SSE2_DISABLED
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+#define HAS_SCALEARGBFILTERROWS_SSSE3
+void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 2f \n"
+ "cmp $0x40,%3 \n"
+ "je 3f \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 2b \n"
+ "jmp 4f \n"
+ ".p2align 4 \n"
+ "3: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "pavgb (%1,%4,1),%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 3b \n"
+ "4: \n"
+ ".p2align 4 \n"
+ "shufps $0xff,%%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+#endif // defined(__x86_64__) || defined(__i386__)
+
+static void ScaleARGBRowDown2_C(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[2];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+ src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+ dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+ src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+ dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+ dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+ src_ptr += 8;
+ dst_ptr += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+ src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+ dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+ src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+ dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+ src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+ dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+ src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+ src_ptr += src_stepx * 4;
+ dst_ptr += 4;
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+
+#define BLENDER1(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+
+#define BLENDERC(a, b, f, s) static_cast<uint32>( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
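+
+// For one channel, BLENDER1(a, b, f) is a 16.16 fixed-point lerp,
+// a + ((f * (b - a)) >> 16); e.g. BLENDER1(100, 200, 0x8000) = 150.
+// BLENDERC applies it to one byte lane and BLENDER combines all four lanes of
+// a packed 32-bit ARGB pixel.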
+
+static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+static const int kMaxInputWidth = 2560;
+
+// C version 2x2 -> 2x1
+void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ assert(dst_width > 0);
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ uint8* end = dst_ptr + (dst_width << 2);
+ do {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+ dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+ dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
+ dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
+ dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
+ dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
+ src_ptr += 8;
+ src_ptr1 += 8;
+ dst_ptr += 8;
+ } while (dst_ptr < end);
+ // Duplicate the last pixel (4 bytes) for filtering.
+ dst_ptr[0] = dst_ptr[-4];
+ dst_ptr[1] = dst_ptr[-3];
+ dst_ptr[2] = dst_ptr[-2];
+ dst_ptr[3] = dst_ptr[-1];
+}
+
+/**
+ * Scale ARGB, 1/2
+ *
+ * This is an optimized version for scaling down an ARGB image to 1/2 of
+ * its original size.
+ *
+ */
+static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
+ ScaleARGBRowDown2_SSE2;
+ }
+#endif
+
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += (src_stride << 1);
+ dst_ptr += dst_stride;
+ }
+}
+
+/**
+ * Scale ARGB down by an even factor
+ *
+ * This is an optimized version for scaling down an ARGB image by an even
+ * factor (e.g. 1/4, 1/6, 1/8) of its original size.
+ *
+ */
+static void ScaleARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+ int src_step = src_width / dst_width;
+ // Adjust to point to center of box.
+ int row_step = src_height / dst_height;
+ int row_stride = row_step * src_stride;
+ src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
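+  // For example, 640x480 -> 160x120 gives src_step = row_step = 4, so this
+  // advances one row and one pixel and each output samples (or, with
+  // filtering, box-averages the 2x2 at) the center of its 4x4 source block.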
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+/**
+ * Scale ARGB to/from any dimensions, with bilinear
+ * interpolation.
+ */
+
+static void ScaleARGBBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(src_width <= kMaxInputWidth);
+ SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
+ void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) =
+ ScaleARGBFilterRows_C;
+#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
+ ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
+ }
+#endif
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ for (int j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_ptr + yi * src_stride;
+ ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
+ ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > maxy) {
+ y = maxy;
+ }
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+// Code is adapted from libyuv bilinear YUV scaling, but with bilinear
+// interpolation disabled, and ARGB pixels instead of YUV.
+static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+ uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+/**
+ * Scale ARGB to/from any dimensions, without interpolation.
+ * Fixed-point math is used for performance: the upper 16 bits
+ * of x and dx are the integer part of the source position and
+ * the lower 16 bits are the fixed-point fraction.
+ */
+
+static void ScaleARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int dx = (src_width << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+ int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+ for (int i = 0; i < dst_height; ++i) {
+ ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
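+// Worked example of the 16.16 stepping above: mapping 640 source columns to
+// 480 destination columns gives dx = (640 << 16) / 480 = 87381 (~1.333), and
+// since dx >= 65536 the start is x = (dx >> 1) - 32768 = 10922 (~0.167),
+// aligning sample centers instead of sampling each span's left edge.
+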
+/**
+ * Scale ARGB to/from any dimensions.
+ */
+static void ScaleARGBAnySize(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering || (src_width > kMaxInputWidth)) {
+ ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+// Scale an ARGB image.
+//
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+
+static void ScaleARGB(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+#ifdef CPU_X86
+ // environment variable overrides for testing.
+ char *filter_override = getenv("LIBYUV_FILTER");
+ if (filter_override) {
+ filtering = (FilterMode)atoi(filter_override); // NOLINT
+ }
+#endif
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // Optimized 1/2.
+ ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ int scale_down_x = src_width / dst_width;
+ int scale_down_y = src_height / dst_height;
+ if (dst_width * scale_down_x == src_width &&
+ dst_height * scale_down_y == src_height) {
+ if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
+ // Optimized even scale down, i.e. 4x, 6x, 8x, 10x.
+ ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if ((scale_down_x & 1) && (scale_down_y & 1)) {
+ filtering = kFilterNone;
+ }
+ }
+ // Arbitrary scale up and/or down.
+ ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+}
+
+// ScaleARGB an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (!src_argb || src_width <= 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src_argb = src_argb + (src_height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ filtering);
+ return 0;
+}
+
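+// A minimal usage sketch of ARGBScale, assuming caller-owned buffers with
+// stride == width * 4; sizes and filter choice are illustrative:
+//   uint8 src_argb[640 * 360 * 4];
+//   uint8 dst_argb[320 * 180 * 4];
+//   ARGBScale(src_argb, 640 * 4, 640, 360,
+//             dst_argb, 320 * 4, 320, 180,
+//             kFilterBilinear);  // a 1/2 scale takes the ScaleARGBDown2 path
+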
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
new file mode 100644
index 00000000..a1946f05
--- /dev/null
+++ b/files/source/scale_neon.cc
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+/**
+ * NEON downscalers with interpolation.
+ *
+ * Provided by Fritz Koenig
+ *
+ */
+
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.u8 {q0,q1}, [%0]! \n"
+ "vst1.u8 {q0}, [%1]! \n" // store even pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.u8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld2.u8 {d0, d1}, [%0]! \n"
+ "vtrn.u8 d1, d0 \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vst1.u32 {d0[1]}, [%1]! \n"
+ "subs %2, #4 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "add r4, %0, %3 \n"
+ "add r5, r4, %3 \n"
+ "add %3, r5, %3 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.u8 {q1}, [r4]! \n"
+ "vld1.u8 {q2}, [r5]! \n"
+ "vld1.u8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.u32 {d0[0]}, [%1]! \n"
+ "subs %2, #4 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stride) // %3
+ : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+
+// Downscale from 4 to 3 pixels. Uses the NEON multilane read/write to
+// deinterleave every 4th pixel into each of four registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
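+
+// The vld4/vst3 pair above keeps bytes 0, 1 and 3 of every 4 source bytes.
+// A scalar sketch of the same mapping (hypothetical reference helper,
+// assuming dst_width is a multiple of 3):
+//   static void ScaleRowDown34_Ref(const uint8* src, uint8* dst, int dst_width) {
+//     for (int x = 0; x < dst_width; x += 3, src += 4) {
+//       dst[x + 0] = src[0];
+//       dst[x + 1] = src[1];
+//       dst[x + 2] = src[3];
+//     }
+//   }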
+
+void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
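ScaleRowDown34_0_Int_NEON above blends the two rows as (3 * row0 + row1) >> 2 with rounding, then maps every 4 blended pixels to 3 using the a0/a1/a2 weights in the comments. A scalar sketch of the same filter (names and plain-C rounding are illustrative):

static void ScaleRowDown34_0_Sketch(const unsigned char* src, int src_stride,
                                    unsigned char* dst, int dst_width) {
  const unsigned char* s1 = src + src_stride;
  for (int i = 0; i < dst_width / 3; ++i) {
    unsigned char p[4];  // vertically blended group of 4 source pixels
    for (int x = 0; x < 4; ++x)
      p[x] = (unsigned char)((3 * src[4 * i + x] + s1[4 * i + x] + 2) >> 2);
    dst[3 * i + 0] = (unsigned char)((3 * p[0] + p[1] + 2) >> 2);
    dst[3 * i + 1] = (unsigned char)((p[1] + p[2] + 1) >> 1);
    dst[3 * i + 2] = (unsigned char)((p[2] + 3 * p[3] + 2) >> 2);
  }
}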
+void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.u8 {d0, d1, d2}, [%1]! \n"
+
+ "subs %2, #24 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+const uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+const uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+const vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+const vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.u8 {d4}, [%1]! \n"
+ "vst1.u32 {d5[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
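kShuf38 above point-samples 8 source pixels down to 3 (indices 0, 3 and 6 of each group of 8), which is the 32 -> 12 reduction. A scalar sketch of the table's effect (illustrative only):

static void ScaleRowDown38_Sketch(const unsigned char* src,
                                  unsigned char* dst, int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    dst[3 * i + 0] = src[8 * i + 0];
    dst[3 * i + 1] = src[8 * i + 3];
    dst[3 * i + 2] = src[8 * i + 6];
  }
}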
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "vld1.u8 {q15}, [%6] \n"
+ "add r4, %0, %3, lsl #1 \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2), // %5
+ "r"(&kMult38_Div9) // %6
+ : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q13", "q14", "q15", "memory", "cc"
+ );
+}
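The divides by 6 and 9 in the kernels above are done with vqrdmulh because NEON has no integer divide: vqrdmulh.s16 returns roughly (2 * a * b + 0x8000) >> 16, so multiplying a sum by 65536 / 12 (or 65536 / 18) approximates sum / 6 (or sum / 9). A scalar sketch of the idea (illustrative; the instruction's exact saturation/rounding is glossed over):

static inline short DivBySixApprox(short sum) {
  const int kMul = 65536 / 12;  // 5461, same value as kMult38_Div6
  return (short)((2 * sum * kMul + 0x8000) >> 16);  // roughly sum / 6
}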
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.u16 {q13}, [%4] \n"
+ "vld1.u8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.u8 {d3}, [%1]! \n"
+ "vst1.u32 {d4[0]}, [%1]! \n"
+ "subs %2, #12 \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 2f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 3f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ "1: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "vld1.u8 {q1}, [%2]! \n"
+ "subs %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 4f \n"
+
+ "2: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "subs %3, #16 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 2b \n"
+ "b 4f \n"
+
+ "3: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "vld1.u8 {q1}, [%2]! \n"
+ "subs %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 3b \n"
+ "4: \n"
+ "vst1.u8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+
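ScaleFilterRows_NEON above is a vertical linear blend with two fast paths: source_y_fraction 0 copies row 0 and 128 averages the rows; otherwise each output is a rounded 8.8 fixed-point mix. Scalar sketch (illustrative, ignoring the trailing duplicate-pixel store):

// dst[i] = (row0[i] * (256 - f) + row1[i] * f + 128) >> 8
static void FilterRows_Sketch(unsigned char* dst, const unsigned char* row0,
                              const unsigned char* row1, int width, int f) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (unsigned char)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
  }
}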
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 8b8ee622..616affd1 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -9,13 +9,14 @@
*/
-#include "video_common.h"
-
-#include <sstream>
+#include "libyuv/video_common.h"
+#ifdef __cplusplus
namespace libyuv {
+extern "C" {
+#endif
-#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
struct FourCCAliasEntry {
uint32 alias;
@@ -24,7 +25,8 @@ struct FourCCAliasEntry {
static const FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
- {FOURCC_YU12, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2},
{FOURCC_HDYC, FOURCC_UYVY},
@@ -35,6 +37,7 @@ static const FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_BGR3, FOURCC_24BG},
};
+LIBYUV_API
uint32 CanonicalFourCC(uint32 fourcc) {
for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
if (kFourCCAliases[i].alias == fourcc) {
@@ -45,4 +48,8 @@ uint32 CanonicalFourCC(uint32 fourcc) {
return fourcc;
}
+#ifdef __cplusplus
+} // extern "C"
} // namespace libyuv
+#endif
+
diff --git a/files/source/video_common.h b/files/source/video_common.h
deleted file mode 100644
index 9fe08a03..00000000
--- a/files/source/video_common.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-/*
-* Common definitions for video, including fourcc and VideoFormat
-*/
-
-
-#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_
-#define LIBYUV_SOURCE_VIDEO_COMMON_H_
-
-#include <string>
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-//////////////////////////////////////////////////////////////////////////////
-// Definition of fourcc.
-//////////////////////////////////////////////////////////////////////////////
-// Convert four characters to a fourcc code.
-// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
-// constants are used in a switch.
-#define FOURCC(a, b, c, d) (\
- (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
- (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
-
-// Some good pages discussing FourCC codes:
-// http://developer.apple.com/quicktime/icefloe/dispatch020.html
-// http://www.fourcc.org/yuv.php
-enum FourCC {
- // Canonical fourcc codes used in our code.
- FOURCC_I420 = FOURCC('I', '4', '2', '0'),
- FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
- FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
- FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
- FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
- FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
- FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
- FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
- FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
- FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
- FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
- // Next four are Bayer RGB formats. The four characters define the order of
- // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom.
- FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
- FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
- FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
- FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
- // Aliases for canonical fourcc codes, replaced with their canonical
- // equivalents by CanonicalFourCC().
- FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420
- FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420
- FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2
- FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac
- FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY
- FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY
- FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG
- FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR
- FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW
- FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG
-
- // Match any fourcc.
- FOURCC_ANY = 0xFFFFFFFF,
-};
-
-// Converts fourcc aliases into canonical ones.
-uint32 CanonicalFourCC(uint32 fourcc);
-
-} // namespace libyuv
-
-#endif // LIBYUV_SOURCE_VIDEO_COMMON_H_
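The FOURCC macro packs its four characters little-endian into a uint32, e.g. FOURCC('I', '4', '2', '0') == 0x30323449, and CanonicalFourCC() folds the alias table above into the canonical codes. Illustrative usage, not part of the patch:

uint32 fmt = CanonicalFourCC(FOURCC_IYUV);  // yields FOURCC_I420
uint32 raw = CanonicalFourCC(FOURCC_ARGB);  // no alias entry: returned unchanged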
diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc
new file mode 100644
index 00000000..8a49a612
--- /dev/null
+++ b/files/unit_test/compare_test.cc
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+
+namespace libyuv {
+
+// hash seed of 5381 recommended.
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 hash = seed;
+ if (count > 0) {
+ do {
+ hash = hash * 33 + *src++;
+ } while (--count);
+ }
+ return hash;
+}
+
+TEST_F(libyuvTest, TestDjb2) {
+ const int kMaxTest = 2049;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ for (int i = 0; i < kMaxTest; ++i) {
+ uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ EXPECT_EQ(h1, h2);
+ }
+ // Hash constant generator used for the tables in compare.
+ int h = 1;
+ for (int i = 0; i <= 16 ; ++i) {
+ printf("%08x ", h);
+ h *= 33;
+ }
+ printf("\n");
+
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_C) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ MaskCpuFlags(kCpuInitialized);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ MaskCpuFlags(-1);
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_OPT) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_Unaligned_OPT) {
+ const int kMaxTest = 1280 * 720;
+ align_buffer_16(src_a, kMaxTest + 1)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i + 1] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_16(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, SumSquareError) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_16(src_a, kMaxWidth)
+ align_buffer_16(src_b, kMaxWidth)
+
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ uint64 err;
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, 0);
+
+ memset(src_a, 1, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, kMaxWidth);
+
+ memset(src_a, 190, kMaxWidth);
+ memset(src_b, 193, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, (kMaxWidth * 3 * 3));
+
+ srandom(time(NULL));
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = (random() & 0xff);
+ src_b[i] = (random() & 0xff);
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ MaskCpuFlags(-1);
+ uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(c_err, opt_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkPsnr_C) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+
+ double c_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ c_time = (get_time() - c_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6);
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, Psnr) {
+ const int kSrcWidth = 1280;
+ const int kSrcHeight = 720;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_16(src_a, kSrcPlaneSize)
+ align_buffer_16(src_b, kSrcPlaneSize)
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, kMaxPsnr);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, 0.0);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 48.0);
+ EXPECT_LT(err, 49.0);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i)
+ src_a[i] = i;
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 4.0);
+ EXPECT_LT(err, 5.0);
+
+ srandom(time(NULL));
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ double c_err, opt_err;
+
+ c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
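The 48-49 dB bracket above follows directly from the PSNR definition: with every pixel off by exactly 1, MSE = 1 and PSNR = 10 * log10(255^2 / 1), about 48.13 dB (identical frames are reported as kMaxPsnr instead, since the true value would be infinite). A quick check of that constant (illustrative):

#include <math.h>
static double PsnrForUnitError() {
  return 10.0 * log10(255.0 * 255.0 / 1.0);  // about 48.13 dB
}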
+
+TEST_F(libyuvTest, BenchmarkSsim_C) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+
+ double c_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ c_time = (get_time() - c_time) / benchmark_iterations_;
+ printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6);
+
+ MaskCpuFlags(-1);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, BenchmarkSsim_OPT) {
+ align_buffer_16(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_16(src_b, benchmark_width_ * benchmark_height_)
+
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+TEST_F(libyuvTest, Ssim) {
+ const int kSrcWidth = 1280;
+ const int kSrcHeight = 720;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_16(src_a, kSrcPlaneSize)
+ align_buffer_16(src_b, kSrcPlaneSize)
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, 1.0);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_LT(err, 0.0001);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 0.8);
+ EXPECT_LT(err, 0.9);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i)
+ src_a[i] = i;
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 0.008);
+ EXPECT_LT(err, 0.009);
+
+ srandom(time(NULL));
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(kCpuInitialized);
+ double c_err, opt_err;
+
+ c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_16(src_a)
+ free_aligned_buffer_16(src_b)
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
new file mode 100644
index 00000000..52810e80
--- /dev/null
+++ b/files/unit_test/cpu_test.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(libyuvTest, TestCpuHas) {
+ int cpu_flags = TestCpuFlag(~kCpuInitialized);
+ printf("Cpu Flags %x\n", cpu_flags);
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM %x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON %x\n", has_neon);
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ printf("Has X86 %x\n", has_x86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ printf("Has SSE2 %x\n", has_sse2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ printf("Has SSSE3 %x\n", has_ssse3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ printf("Has SSE4.1 %x\n", has_sse41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ printf("Has SSE4.2 %x\n", has_sse42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ printf("Has AVX %x\n", has_avx);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ printf("Has AVX2 %x\n", has_avx2);
+}
+
+#if defined(__i386__) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_X64)
+TEST_F(libyuvTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
+ CpuId(cpu_info, 0);
+ cpu_info[0] = cpu_info[1]; // Reorder to EBX, EDX, ECX so the 12-byte
+ cpu_info[1] = cpu_info[3]; // vendor string reads contiguously.
+ cpu_info[3] = 0; // NUL terminate for printf/strlen.
+ printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
+ cpu_info[0], cpu_info[1], cpu_info[2]);
+ EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
+ CpuId(cpu_info, 1);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+ model, model);
+ }
+}
+#endif
+
+TEST_F(libyuvTest, TestLinuxNeon) {
+ int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
+ if (testdata) {
+ EXPECT_EQ(kCpuInitialized,
+ ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ((kCpuInitialized | kCpuHasNEON),
+ ArmCpuCaps("unit_test/testdata/tegra3.txt"));
+ } else {
+ printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
+#endif
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc
new file mode 100644
index 00000000..e9053a35
--- /dev/null
+++ b/files/unit_test/planar_test.cc
@@ -0,0 +1,1005 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
+namespace libyuv {
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \
+ align_buffer_16(src_y, kWidth * kHeight); \
+ align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_argb_c, kStride * kHeight); \
+ align_buffer_16(dst_argb_opt, kStride * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j] = (random() & 0xff); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ src_u[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \
+ src_v[(i * kWidth / SUBSAMP_X) + j] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_u, kWidth / SUBSAMP_X, \
+ src_v, kWidth / SUBSAMP_X, \
+ dst_argb_c, kStride, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_u, kWidth / SUBSAMP_X, \
+ src_v, kWidth / SUBSAMP_X, \
+ dst_argb_opt, kStride, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * BPP_B; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
+ static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(src_y) \
+ free_aligned_buffer_16(src_u) \
+ free_aligned_buffer_16(src_v) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2)
+// TODO(fbarchard): Re-enable test and fix valgrind.
+// TESTPLANARTOB(I420, 2, 2, V210, 16 / 6)
+TESTPLANARTOB(I420, 2, 2, I400, 1)
+TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1)
+TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1)
+TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1)
+TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ N, NEG) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ align_buffer_16(src_y, kWidth * kHeight); \
+ align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \
+ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j] = (random() & 0xff); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) \
+ for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) { \
+ src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_uv, kWidth / SUBSAMP_X * 2, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y, kWidth, \
+ src_uv, kWidth / SUBSAMP_X * 2, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * BPP_B; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i * kWidth * BPP_B + j]) - \
+ static_cast<int>(dst_argb_opt[i * kWidth * BPP_B + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_16(src_y) \
+ free_aligned_buffer_16(src_uv) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
+
+#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+ align_buffer_16(src_argb, kStride * kHeight); \
+ align_buffer_16(dst_y_c, kWidth * kHeight); \
+ align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_y_opt, kWidth * kHeight); \
+ align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j] = (random() & 0xff); \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_PLANAR(src_argb, kStride, \
+ dst_y_c, kWidth, \
+ dst_u_c, kWidth / SUBSAMP_X, \
+ dst_v_c, kWidth / SUBSAMP_X, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb, kStride, \
+ dst_y_opt, kWidth, \
+ dst_u_opt, kWidth / SUBSAMP_X, \
+ dst_v_opt, kWidth / SUBSAMP_X, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) - \
+ static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+ for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) - \
+ static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(dst_y_c) \
+ free_aligned_buffer_16(dst_u_c) \
+ free_aligned_buffer_16(dst_v_c) \
+ free_aligned_buffer_16(dst_y_opt) \
+ free_aligned_buffer_16(dst_u_opt) \
+ free_aligned_buffer_16(dst_v_opt) \
+ free_aligned_buffer_16(src_argb) \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -)
+
+TESTATOPLANAR(ARGB, 4, I420, 2, 2)
+TESTATOPLANAR(BGRA, 4, I420, 2, 2)
+TESTATOPLANAR(ABGR, 4, I420, 2, 2)
+TESTATOPLANAR(RGBA, 4, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, I420, 2, 2)
+TESTATOPLANAR(RGB24, 3, I420, 2, 2)
+TESTATOPLANAR(RGB565, 2, I420, 2, 2)
+TESTATOPLANAR(ARGB1555, 2, I420, 2, 2)
+TESTATOPLANAR(ARGB4444, 2, I420, 2, 2)
+// TESTATOPLANAR(ARGB, 4, I411, 4, 1)
+TESTATOPLANAR(ARGB, 4, I422, 2, 1)
+// TESTATOPLANAR(ARGB, 4, I444, 1, 1)
+// TODO(fbarchard): Implement and test 411 and 444
+TESTATOPLANAR(YUY2, 2, I420, 2, 2)
+TESTATOPLANAR(UYVY, 2, I420, 2, 2)
+TESTATOPLANAR(YUY2, 2, I422, 2, 1)
+TESTATOPLANAR(UYVY, 2, I422, 2, 1)
+TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
+TESTATOPLANAR(I400, 1, I420, 2, 2)
+TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
+TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
+TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2)
+TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, N, NEG) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
+ const int kWidth = 1280; \
+ const int kHeight = 720; \
+ align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
+ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_16(src_argb) \
+ free_aligned_buffer_16(dst_argb_c) \
+ free_aligned_buffer_16(dst_argb_opt) \
+}
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -)
+
+TESTATOB(I400, 1, 1, I400, 1)
+TESTATOB(ARGB, 4, 4, ARGB, 4)
+TESTATOB(ARGB, 4, 4, BGRA, 4)
+TESTATOB(ARGB, 4, 4, ABGR, 4)
+TESTATOB(ARGB, 4, 4, RGBA, 4)
+TESTATOB(ARGB, 4, 4, RAW, 3)
+TESTATOB(ARGB, 4, 4, RGB24, 3)
+TESTATOB(ARGB, 4, 4, RGB565, 2)
+TESTATOB(ARGB, 4, 4, ARGB1555, 2)
+TESTATOB(ARGB, 4, 4, ARGB4444, 2)
+TESTATOB(BGRA, 4, 4, ARGB, 4)
+TESTATOB(ABGR, 4, 4, ARGB, 4)
+TESTATOB(RGBA, 4, 4, ARGB, 4)
+TESTATOB(RAW, 3, 3, ARGB, 4)
+TESTATOB(RGB24, 3, 3, ARGB, 4)
+TESTATOB(RGB565, 2, 2, ARGB, 4)
+TESTATOB(ARGB1555, 2, 2, ARGB, 4)
+TESTATOB(ARGB4444, 2, 2, ARGB, 4)
+TESTATOB(YUY2, 2, 2, ARGB, 4)
+TESTATOB(UYVY, 2, 2, ARGB, 4)
+TESTATOB(M420, 3 / 2, 1, ARGB, 4)
+
+static const int kReadPad = 16; // Allow overread of 16 bytes.
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
+ srandom(time(NULL)); \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (random() & 63) + 1; \
+ const int kHeight = (random() & 31) + 1; \
+ align_buffer_page_end(src_argb, (kWidth * BPP_A) * kHeight + kReadPad); \
+ align_buffer_page_end(dst_argb_c, (kWidth * BPP_B) * kHeight); \
+ align_buffer_page_end(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+ for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(kCpuInitialized); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, kHeight); \
+ MaskCpuFlags(-1); \
+ FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, 2); \
+ free_aligned_buffer_page_end(src_argb) \
+ free_aligned_buffer_page_end(dst_argb_c) \
+ free_aligned_buffer_page_end(dst_argb_opt) \
+ } \
+}
+
+TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4)
+TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4)
+TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4)
+TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4)
+TESTATOBRANDOM(ARGB, 4, 4, RAW, 3)
+TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3)
+TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2)
+TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2)
+TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2)
+
+TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4)
+TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4)
+TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4)
+TESTATOBRANDOM(RAW, 3, 3, ARGB, 4)
+TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4)
+TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4)
+TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4)
+TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4)
+
+TEST_F(libyuvTest, TestAttenuate) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
+
+ // Test unattenuation clamps
+ orig_pixels[0][0] = 200u;
+ orig_pixels[0][1] = 129u;
+ orig_pixels[0][2] = 127u;
+ orig_pixels[0][3] = 128u;
+ // Test unattenuation transparent and opaque are unaffected
+ orig_pixels[1][0] = 16u;
+ orig_pixels[1][1] = 64u;
+ orig_pixels[1][2] = 192u;
+ orig_pixels[1][3] = 0u;
+ orig_pixels[2][0] = 16u;
+ orig_pixels[2][1] = 64u;
+ orig_pixels[2][2] = 192u;
+ orig_pixels[2][3] = 255u;
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 128u;
+ ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
+ EXPECT_EQ(255u, unatten_pixels[0][0]);
+ EXPECT_EQ(255u, unatten_pixels[0][1]);
+ EXPECT_EQ(254u, unatten_pixels[0][2]);
+ EXPECT_EQ(128u, unatten_pixels[0][3]);
+ EXPECT_EQ(16u, unatten_pixels[1][0]);
+ EXPECT_EQ(64u, unatten_pixels[1][1]);
+ EXPECT_EQ(192u, unatten_pixels[1][2]);
+ EXPECT_EQ(0u, unatten_pixels[1][3]);
+ EXPECT_EQ(16u, unatten_pixels[2][0]);
+ EXPECT_EQ(64u, unatten_pixels[2][1]);
+ EXPECT_EQ(192u, unatten_pixels[2][2]);
+ EXPECT_EQ(255u, unatten_pixels[2][3]);
+ EXPECT_EQ(32u, unatten_pixels[3][0]);
+ EXPECT_EQ(128u, unatten_pixels[3][1]);
+ EXPECT_EQ(255u, unatten_pixels[3][2]);
+ EXPECT_EQ(128u, unatten_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
+ ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
+ }
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
+ EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
+ EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
+ EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
+ }
+ // Make sure transparent, 50% and opaque are fully accurate.
+ EXPECT_EQ(0, atten_pixels[0][0]);
+ EXPECT_EQ(0, atten_pixels[0][1]);
+ EXPECT_EQ(0, atten_pixels[0][2]);
+ EXPECT_EQ(0, atten_pixels[0][3]);
+ EXPECT_EQ(64, atten_pixels[128][0]);
+ EXPECT_EQ(32, atten_pixels[128][1]);
+ EXPECT_EQ(21, atten_pixels[128][2]);
+ EXPECT_EQ(128, atten_pixels[128][3]);
+ EXPECT_EQ(255, atten_pixels[255][0]);
+ EXPECT_EQ(127, atten_pixels[255][1]);
+ EXPECT_EQ(85, atten_pixels[255][2]);
+ EXPECT_EQ(255, atten_pixels[255][3]);
+}
+
+TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
+ SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+ SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ orig_pixels[y][x][0] = 1u;
+ orig_pixels[y][x][1] = 2u;
+ orig_pixels[y][x][2] = 3u;
+ orig_pixels[y][x][3] = 255u;
+ }
+ }
+
+ ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+ &added_pixels[0][0][0], 16 * 4,
+ 16, 16);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+ EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+ EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+ EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+ }
+ }
+}
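ARGBComputeCumulativeSum builds a per-channel integral image, sum[y][x] = v[y][x] + sum[y-1][x] + sum[y][x-1] - sum[y-1][x-1], so a constant input of 1 yields (x + 1) * (y + 1), exactly what the loop above checks. Minimal single-channel sketch (illustrative only):

static void CumulativeSum_Sketch(const unsigned char* src, int* dst,
                                 int width, int height) {
  for (int y = 0; y < height; ++y) {
    int row = 0;  // running sum along the current row
    for (int x = 0; x < width; ++x) {
      row += src[y * width + x];
      dst[y * width + x] = row + (y ? dst[(y - 1) * width + x] : 0);
    }
  }
}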
+
+TEST_F(libyuvTest, TestARGBGray) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(27u, orig_pixels[0][0]);
+ EXPECT_EQ(27u, orig_pixels[0][1]);
+ EXPECT_EQ(27u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(151u, orig_pixels[1][0]);
+ EXPECT_EQ(151u, orig_pixels[1][1]);
+ EXPECT_EQ(151u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(75u, orig_pixels[2][0]);
+ EXPECT_EQ(75u, orig_pixels[2][1]);
+ EXPECT_EQ(75u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(96u, orig_pixels[3][0]);
+ EXPECT_EQ(96u, orig_pixels[3][1]);
+ EXPECT_EQ(96u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBGrayTo) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 gray_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
+ EXPECT_EQ(27u, gray_pixels[0][0]);
+ EXPECT_EQ(27u, gray_pixels[0][1]);
+ EXPECT_EQ(27u, gray_pixels[0][2]);
+ EXPECT_EQ(128u, gray_pixels[0][3]);
+ EXPECT_EQ(151u, gray_pixels[1][0]);
+ EXPECT_EQ(151u, gray_pixels[1][1]);
+ EXPECT_EQ(151u, gray_pixels[1][2]);
+ EXPECT_EQ(0u, gray_pixels[1][3]);
+ EXPECT_EQ(75u, gray_pixels[2][0]);
+ EXPECT_EQ(75u, gray_pixels[2][1]);
+ EXPECT_EQ(75u, gray_pixels[2][2]);
+ EXPECT_EQ(255u, gray_pixels[2][3]);
+ EXPECT_EQ(96u, gray_pixels[3][0]);
+ EXPECT_EQ(96u, gray_pixels[3][1]);
+ EXPECT_EQ(96u, gray_pixels[3][2]);
+ EXPECT_EQ(224u, gray_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBSepia) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(88u, orig_pixels[3][0]);
+ EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Matrix for Sepia.
+ static const int8 kARGBToSepia[] = {
+ 17, 68, 35, 0,
+ 22, 88, 45, 0,
+ 24, 98, 50, 0,
+ };
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(88u, orig_pixels[3][0]);
+ EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Color table; only the first 16 entries are defined, the rest are zero.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u,
+ 5u, 6u, 7u, 8u,
+ 9u, 10u, 11u, 12u,
+ 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(4u, orig_pixels[0][3]);
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(8u, orig_pixels[1][3]);
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(12u, orig_pixels[2][3]);
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(16u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBQuantize) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]);
+ EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]);
+ EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBMirror) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 dst_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i / 4;
+ }
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, dst_pixels[255 - i][0]);
+ EXPECT_EQ(i / 2, dst_pixels[255 - i][1]);
+ EXPECT_EQ(i / 3, dst_pixels[255 - i][2]);
+ EXPECT_EQ(i / 4, dst_pixels[255 - i][3]);
+ }
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestShade) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 shade_pixels[256][4]);
+
+ orig_pixels[0][0] = 10u;
+ orig_pixels[0][1] = 20u;
+ orig_pixels[0][2] = 40u;
+ orig_pixels[0][3] = 80u;
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 0u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 255u;
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 0u;
+ orig_pixels[2][3] = 0u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 0u;
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
+ EXPECT_EQ(10u, shade_pixels[0][0]);
+ EXPECT_EQ(20u, shade_pixels[0][1]);
+ EXPECT_EQ(40u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+ EXPECT_EQ(0u, shade_pixels[1][0]);
+ EXPECT_EQ(0u, shade_pixels[1][1]);
+ EXPECT_EQ(0u, shade_pixels[1][2]);
+ EXPECT_EQ(128u, shade_pixels[1][3]);
+ EXPECT_EQ(0u, shade_pixels[2][0]);
+ EXPECT_EQ(0u, shade_pixels[2][1]);
+ EXPECT_EQ(0u, shade_pixels[2][2]);
+ EXPECT_EQ(0u, shade_pixels[2][3]);
+ EXPECT_EQ(0u, shade_pixels[3][0]);
+ EXPECT_EQ(0u, shade_pixels[3][1]);
+ EXPECT_EQ(0u, shade_pixels[3][2]);
+ EXPECT_EQ(0u, shade_pixels[3][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(10u, shade_pixels[0][1]);
+ EXPECT_EQ(20u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
+ 0x80808080);
+ }
+}
+
+TEST_F(libyuvTest, TestInterpolate) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
+
+ orig_pixels_0[0][0] = 16u;
+ orig_pixels_0[0][1] = 32u;
+ orig_pixels_0[0][2] = 64u;
+ orig_pixels_0[0][3] = 128u;
+ orig_pixels_0[1][0] = 0u;
+ orig_pixels_0[1][1] = 0u;
+ orig_pixels_0[1][2] = 0u;
+ orig_pixels_0[1][3] = 255u;
+ orig_pixels_0[2][0] = 0u;
+ orig_pixels_0[2][1] = 0u;
+ orig_pixels_0[2][2] = 0u;
+ orig_pixels_0[2][3] = 0u;
+ orig_pixels_0[3][0] = 0u;
+ orig_pixels_0[3][1] = 0u;
+ orig_pixels_0[3][2] = 0u;
+ orig_pixels_0[3][3] = 0u;
+
+ orig_pixels_1[0][0] = 0u;
+ orig_pixels_1[0][1] = 0u;
+ orig_pixels_1[0][2] = 0u;
+ orig_pixels_1[0][3] = 0u;
+ orig_pixels_1[1][0] = 0u;
+ orig_pixels_1[1][1] = 0u;
+ orig_pixels_1[1][2] = 0u;
+ orig_pixels_1[1][3] = 0u;
+ orig_pixels_1[2][0] = 0u;
+ orig_pixels_1[2][1] = 0u;
+ orig_pixels_1[2][2] = 0u;
+ orig_pixels_1[2][3] = 0u;
+ orig_pixels_1[3][0] = 255u;
+ orig_pixels_1[3][1] = 255u;
+ orig_pixels_1[3][2] = 255u;
+ orig_pixels_1[3][3] = 255u;
+
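+ // The last argument is the interpolation fraction in 1/256 units: 0 returns
+ // source 0 unchanged, 128 is an even 50/50 blend, and 192 weights source 1
+ // at three quarters, as the expectations below confirm.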
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 128);
+ EXPECT_EQ(8u, interpolate_pixels[0][0]);
+ EXPECT_EQ(16u, interpolate_pixels[0][1]);
+ EXPECT_EQ(32u, interpolate_pixels[0][2]);
+ EXPECT_EQ(64u, interpolate_pixels[0][3]);
+ EXPECT_EQ(0u, interpolate_pixels[1][0]);
+ EXPECT_EQ(0u, interpolate_pixels[1][1]);
+ EXPECT_EQ(0u, interpolate_pixels[1][2]);
+ EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128.
+ EXPECT_EQ(0u, interpolate_pixels[2][0]);
+ EXPECT_EQ(0u, interpolate_pixels[2][1]);
+ EXPECT_EQ(0u, interpolate_pixels[2][2]);
+ EXPECT_EQ(0u, interpolate_pixels[2][3]);
+ EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0][0]);
+ EXPECT_EQ(32u, interpolate_pixels[0][1]);
+ EXPECT_EQ(64u, interpolate_pixels[0][2]);
+ EXPECT_EQ(128u, interpolate_pixels[0][3]);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0][0]);
+ EXPECT_EQ(8u, interpolate_pixels[0][1]);
+ EXPECT_EQ(16u, interpolate_pixels[0][2]);
+ EXPECT_EQ(32u, interpolate_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_iterations_ * (1280 * 720 / 256); ++i) {
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 256, 1, 128);
+ }
+}
+
+TEST_F(libyuvTest, TestAffine) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]);
+#endif
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels_0[i][j] = i;
+ }
+ }
+
+ float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
+
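+ // uv_step packs the starting (u, v) source coordinate and the per-pixel
+ // (du, dv) step; with du = 0.75 destination pixel x samples source pixel
+ // 0.75 * x, which the spot checks at x = 128 and x = 255 below rely on.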
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
+ uv_step, 256);
+ EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
+ EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
+ EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4));
+#endif
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ if (has_sse2) {
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ }
+ } else {
+#endif
+ for (int i = 0; i < benchmark_iterations_ * 1280 * 720 / 256; ++i) {
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
+ uv_step, 256);
+ }
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ }
+#endif
+}
+
+TEST_F(libyuvTest, Test565) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
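+ // Convert the gradient to RGB565 and compare a HashDjb2 checksum of the
+ // packed output against a known-good value instead of checking each pixel.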
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc
new file mode 100644
index 00000000..fe8435e1
--- /dev/null
+++ b/files/unit_test/rotate_argb_test.cc
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static int ARGBTestRotate(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode, int runs) {
+ const int b = 128;
+ int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
+ int src_stride_argb = (b * 2 + src_width) * 4;
+
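+ // Each plane is allocated with a border of b pixels on every side and the
+ // rotate operates on the interior region, so any stray reads or writes land
+ // in the border rather than outside the allocation.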
+ align_buffer_16(src_argb, src_argb_plane_size)
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ srandom(time(NULL));
+
+ int i, j;
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
+ }
+ }
+
+ align_buffer_16(dst_argb_c, dst_argb_plane_size)
+ align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < runs; ++i) {
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ c_time = (get_time() - c_time) / runs;
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < runs; ++i) {
+ ARGBRotate(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ opt_time = (get_time() - opt_time) / runs;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+
+ // The C version may differ slightly from the optimized version because the
+ // order of operations can introduce rounding. Diff the two buffers and
+ // verify that the maximum per-byte difference stays within a small tolerance.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff)
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_16(dst_argb_c)
+ free_aligned_buffer_16(dst_argb_opt)
+ free_aligned_buffer_16(src_argb)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBRotate0) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 1280;
+ const int dst_height = 720;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate0,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate90) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 720;
+ const int dst_height = 1280;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate90,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate180) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 1280;
+ const int dst_height = 720;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate180,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate270) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = 720;
+ const int dst_height = 1280;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate270,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate0_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 1277;
+ const int dst_height = 719;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate0,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate90_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 719;
+ const int dst_height = 1277;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate90,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate180_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 1277;
+ const int dst_height = 719;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate180,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+TEST_F(libyuvTest, ARGBRotate270_Odd) {
+ const int src_width = 1277;
+ const int src_height = 719;
+ const int dst_width = 719;
+ const int dst_height = 1277;
+
+ int err = ARGBTestRotate(src_width, src_height,
+ dst_width, dst_height, kRotate270,
+ benchmark_iterations_);
+ EXPECT_GE(1, err);
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
index 1c295b08..788e511e 100644
--- a/files/unit_test/rotate_test.cc
+++ b/files/unit_test/rotate_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,21 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/rotate.h"
-#include "../source/rotate_priv.h"
-#include "unit_test.h"
#include <stdlib.h>
#include <time.h>
-using namespace libyuv;
-
-void print_array(uint8 *array, int w, int h) {
- int i, j;
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j)
- printf("%4d", (signed char)array[(i * w) + j]);
+namespace libyuv {
+void PrintArray(uint8 *array, int w, int h) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ printf("%4d", (signed char)array[i * w + j]);
+ }
printf("\n");
}
}
@@ -31,46 +29,45 @@ TEST_F(libyuvTest, Transpose) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_1;
- uint8 *output_2;
-
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_1, ow * oh)
+ align_buffer_16(output_2, iw * ih)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
TransposePlane(input, iw, output_1, ow, iw, ih);
TransposePlane(output_1, ow, output_2, oh, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_2[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_2[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("transpose 1\n");
- print_array(output_1, ow, oh);
+ PrintArray(output_1, ow, oh);
printf("transpose 2\n");
- print_array(output_2, iw, ih);
+ PrintArray(output_2, iw, ih);
}
- free(input);
- free(output_1);
- free(output_2);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_1)
+ free_aligned_buffer_16(output_2)
}
+ }
EXPECT_EQ(0, err);
}
@@ -79,23 +76,20 @@ TEST_F(libyuvTest, TransposeUV) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_a1, *output_b1;
- uint8 *output_a2, *output_b2;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_a1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_b1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_a2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_b2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_a1, ow * oh)
+ align_buffer_16(output_b1, ow * oh)
+ align_buffer_16(output_a2, iw * ih)
+ align_buffer_16(output_b2, iw * ih)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -105,32 +99,35 @@ TEST_F(libyuvTest, TransposeUV) {
TransposePlane(output_a1, ow, output_a2, oh, ow, oh);
TransposePlane(output_b1, ow, output_b2, oh, ow, oh);
- for (i = 0; i < (iw * ih); i += 2) {
- if (input[i] != output_a2[i >> 1])
+ for (i = 0; i < iw * ih; i += 2) {
+ if (input[i] != output_a2[i >> 1]) {
err++;
- if (input[i + 1] != output_b2[i >> 1])
+ }
+ if (input[i + 1] != output_b2[i >> 1]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("transpose 1\n");
- print_array(output_a1, ow, oh);
- print_array(output_b1, ow, oh);
+ PrintArray(output_a1, ow, oh);
+ PrintArray(output_b1, ow, oh);
printf("transpose 2\n");
- print_array(output_a2, oh, ow);
- print_array(output_b2, oh, ow);
+ PrintArray(output_a2, oh, ow);
+ PrintArray(output_b2, oh, ow);
}
- free(input);
- free(output_a1);
- free(output_b1);
- free(output_a2);
- free(output_b2);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_a1)
+ free_aligned_buffer_16(output_b1)
+ free_aligned_buffer_16(output_a2)
+ free_aligned_buffer_16(output_b2)
}
+ }
EXPECT_EQ(0, err);
}
@@ -139,60 +136,58 @@ TEST_F(libyuvTest, RotatePlane90) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
- uint8 *output_180;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
+ align_buffer_16(output_180, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw, output_90, ow, iw, ih);
RotatePlane90(output_90, ow, output_180, oh, ow, oh);
RotatePlane90(output_180, oh, output_270, ow, oh, ow);
RotatePlane90(output_270, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 90\n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 270\n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
- free(output_180);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
+ free_aligned_buffer_16(output_180)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
@@ -201,29 +196,22 @@ TEST_F(libyuvTest, RotateUV90) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_90_u;
- uint8 *output_90_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_90_u, ow * oh)
+ align_buffer_16(output_90_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -237,43 +225,46 @@ TEST_F(libyuvTest, RotateUV90) {
RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 90_u\n");
- print_array(output_90_u, ow, oh);
+ PrintArray(output_90_u, ow, oh);
printf("output 90_v\n");
- print_array(output_90_v, ow, oh);
+ PrintArray(output_90_v, ow, oh);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, oh, ow);
+ PrintArray(output_0_u, oh, ow);
printf("output 0_v\n");
- print_array(output_0_v, oh, ow);
+ PrintArray(output_0_v, oh, ow);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_90_u);
- free(output_90_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_90_u)
+ free_aligned_buffer_16(output_90_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -282,29 +273,22 @@ TEST_F(libyuvTest, RotateUV180) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_90_u;
- uint8 *output_90_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = iw >> 1;
oh = ih;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_90_u, ow * oh)
+ align_buffer_16(output_90_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -318,43 +302,46 @@ TEST_F(libyuvTest, RotateUV180) {
RotatePlane90(output_90_v, oh, output_0_v, ow, oh, ow);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 90_u\n");
- print_array(output_90_u, oh, ow);
+ PrintArray(output_90_u, oh, ow);
printf("output 90_v\n");
- print_array(output_90_v, oh, ow);
+ PrintArray(output_90_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, ow, oh);
+ PrintArray(output_0_u, ow, oh);
printf("output 0_v\n");
- print_array(output_0_v, ow, oh);
+ PrintArray(output_0_v, ow, oh);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_90_u);
- free(output_90_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_90_u)
+ free_aligned_buffer_16(output_90_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -363,29 +350,22 @@ TEST_F(libyuvTest, RotateUV270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0_u;
- uint8 *output_0_v;
- uint8 *output_270_u;
- uint8 *output_270_v;
- uint8 *output_180_u;
- uint8 *output_180_v;
ow = ih;
oh = iw >> 1;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_270_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_270_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0_u, ow * oh)
+ align_buffer_16(output_0_v, ow * oh)
+ align_buffer_16(output_270_u, ow * oh)
+ align_buffer_16(output_270_v, ow * oh)
+ align_buffer_16(output_180_u, ow * oh)
+ align_buffer_16(output_180_v, ow * oh)
- for (i = 0; i < (iw * ih); i += 2) {
+ for (i = 0; i < iw * ih; i += 2) {
input[i] = i >> 1;
input[i + 1] = -(i >> 1);
}
@@ -400,43 +380,46 @@ TEST_F(libyuvTest, RotateUV270) {
RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh);
for (i = 0; i < (ow * oh); ++i) {
- if (output_0_u[i] != (uint8)i)
+ if (output_0_u[i] != (uint8)i) {
err++;
- if (output_0_v[i] != (uint8)(-i))
+ }
+ if (output_0_v[i] != (uint8)(-i)) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 270_u\n");
- print_array(output_270_u, ow, oh);
+ PrintArray(output_270_u, ow, oh);
printf("output 270_v\n");
- print_array(output_270_v, ow, oh);
+ PrintArray(output_270_v, ow, oh);
printf("output 180_u\n");
- print_array(output_180_u, oh, ow);
+ PrintArray(output_180_u, oh, ow);
printf("output 180_v\n");
- print_array(output_180_v, oh, ow);
+ PrintArray(output_180_v, oh, ow);
printf("output 0_u\n");
- print_array(output_0_u, oh, ow);
+ PrintArray(output_0_u, oh, ow);
printf("output 0_v\n");
- print_array(output_0_v, oh, ow);
+ PrintArray(output_0_v, oh, ow);
}
- free(input);
- free(output_0_u);
- free(output_0_v);
- free(output_270_u);
- free(output_270_v);
- free(output_180_u);
- free(output_180_v);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0_u)
+ free_aligned_buffer_16(output_0_v)
+ free_aligned_buffer_16(output_270_u)
+ free_aligned_buffer_16(output_270_v)
+ free_aligned_buffer_16(output_180_u)
+ free_aligned_buffer_16(output_180_v)
}
+ }
EXPECT_EQ(0, err);
}
@@ -445,45 +428,44 @@ TEST_F(libyuvTest, RotatePlane180) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw)
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_180;
ow = iw;
oh = ih;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_180, iw * ih)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane180(input, iw, output_180, ow, iw, ih);
RotatePlane180(output_180, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_180);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_180)
}
EXPECT_EQ(0, err);
@@ -493,25 +475,20 @@ TEST_F(libyuvTest, RotatePlane270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 8; iw < _rotate_max_w && !err; ++iw)
- for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+ for (iw = 8; iw < rotate_max_w_ && !err; ++iw) {
+ for (ih = 8; ih < rotate_max_h_ && !err; ++ih) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
- uint8 *output_180;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
- output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
+ align_buffer_16(output_180, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i)
input[i] = i;
RotatePlane270(input, iw, output_270, ow, iw, ih);
@@ -519,34 +496,36 @@ TEST_F(libyuvTest, RotatePlane270) {
RotatePlane270(output_180, oh, output_90, ow, oh, ow);
RotatePlane270(output_90, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("input %dx%d \n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output 270\n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output 180\n");
- print_array(output_180, iw, ih);
+ PrintArray(output_180, iw, ih);
printf("output 90\n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output 0\n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
- free(output_180);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
+ free_aligned_buffer_16(output_180)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
@@ -555,44 +534,44 @@ TEST_F(libyuvTest, RotatePlane90and270) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
+
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw, output_90, ow, iw, ih);
RotatePlane270(output_90, ow, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
}
EXPECT_EQ(0, err);
@@ -602,21 +581,20 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
int iw, ih;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4)
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_90;
+
int ow = ih;
int oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_90, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane90(input, iw,
output_90 + (ow >> 1), ow,
@@ -633,25 +611,26 @@ TEST_F(libyuvTest, RotatePlane90Pitch) {
RotatePlane270(output_90, ih, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_90, ow, oh);
+ PrintArray(output_90, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_90);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_90)
}
EXPECT_EQ(0, err);
@@ -661,22 +640,20 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
int iw, ih, ow, oh;
int err = 0;
- for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
- for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+ for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) {
+ for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) {
int i;
- uint8 *input;
- uint8 *output_0;
- uint8 *output_270;
ow = ih;
oh = iw;
- input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8)));
- output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8)));
+ align_buffer_16(input, iw * ih)
+ align_buffer_16(output_0, iw * ih)
+ align_buffer_16(output_270, ow * oh)
- for (i = 0; i < (iw * ih); ++i)
+ for (i = 0; i < iw * ih; ++i) {
input[i] = i;
+ }
RotatePlane270(input, iw,
output_270 + ow * (oh >> 1), ow,
@@ -693,36 +670,34 @@ TEST_F(libyuvTest, RotatePlane270Pitch) {
RotatePlane90(output_270, ih, output_0, iw, ow, oh);
- for (i = 0; i < (iw * ih); ++i) {
- if (input[i] != output_0[i])
+ for (i = 0; i < iw * ih; ++i) {
+ if (input[i] != output_0[i]) {
err++;
+ }
}
if (err) {
printf("intput %dx%d\n", iw, ih);
- print_array(input, iw, ih);
+ PrintArray(input, iw, ih);
printf("output \n");
- print_array(output_270, ow, oh);
+ PrintArray(output_270, ow, oh);
printf("output \n");
- print_array(output_0, iw, ih);
+ PrintArray(output_0, iw, ih);
}
- free(input);
- free(output_0);
- free(output_270);
+ free_aligned_buffer_16(input)
+ free_aligned_buffer_16(output_0)
+ free_aligned_buffer_16(output_270)
}
+ }
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, I420Rotate90) {
int err = 0;
- uint8 *orig_y, *orig_u, *orig_v;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -732,50 +707,59 @@ TEST_F(libyuvTest, I420Rotate90) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_u, uv_plane_size)
+ align_buffer_16(orig_v, uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_u, 0, uv_plane_size);
+ memset(orig_v, 0, uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
for (j = b; j < (uvw + b); ++j) {
- orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff;
- orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff;
+ orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
+ orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
I420Rotate(orig_y+y_off_0, y_st_0,
orig_u+uv_off_0, uv_st_0,
@@ -805,39 +789,38 @@ TEST_F(libyuvTest, I420Rotate90) {
kRotateClockwise);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
for (i = 0; i < uv_plane_size; ++i) {
- if (orig_u[i] != ro0_u[i])
+ if (orig_u[i] != ro0_u[i]) {
++err;
- if (orig_v[i] != ro0_v[i])
+ }
+ if (orig_v[i] != ro0_v[i]) {
++err;
+ }
}
- free(orig_y);
- free(orig_u);
- free(orig_v);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_u)
+ free_aligned_buffer_16(orig_v)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, I420Rotate270) {
int err = 0;
- uint8 *orig_y, *orig_u, *orig_v;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -847,50 +830,59 @@ TEST_F(libyuvTest, I420Rotate270) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_u, uv_plane_size)
+ align_buffer_16(orig_v, uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_u, 0, uv_plane_size);
+ memset(orig_v, 0, uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
for (j = b; j < (uvw + b); ++j) {
- orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff;
- orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff;
+ orig_u[i * (uvw + b * 2) + j] = random() & 0xff;
+ orig_v[i * (uvw + b * 2) + j] = random() & 0xff;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
I420Rotate(orig_y+y_off_0, y_st_0,
orig_u+uv_off_0, uv_st_0,
@@ -920,38 +912,38 @@ TEST_F(libyuvTest, I420Rotate270) {
kRotateCounterClockwise);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
for (i = 0; i < uv_plane_size; ++i) {
- if (orig_u[i] != ro0_u[i])
+ if (orig_u[i] != ro0_u[i]) {
++err;
- if (orig_v[i] != ro0_v[i])
+ }
+ if (orig_v[i] != ro0_v[i]) {
++err;
+ }
}
- free(orig_y);
- free(orig_u);
- free(orig_v);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_u)
+ free_aligned_buffer_16(orig_v)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate90) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro90_y, *ro90_u, *ro90_v;
int yw = 1024;
int yh = 768;
@@ -960,47 +952,53 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro90_y, y_plane_size)
+ align_buffer_16(ro90_u, uv_plane_size)
+ align_buffer_16(ro90_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro90_y, 0, y_plane_size);
+ memset(ro90_u, 0, uv_plane_size);
+ memset(ro90_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1027,32 +1025,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) {
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro90_y);
- free(ro90_u);
- free(ro90_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro90_y)
+ free_aligned_buffer_16(ro90_u)
+ free_aligned_buffer_16(ro90_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate270) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro270_y, *ro270_u, *ro270_v;
int yw = 1024;
int yh = 768;
@@ -1062,47 +1060,53 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro270_y, y_plane_size)
+ align_buffer_16(ro270_u, uv_plane_size)
+ align_buffer_16(ro270_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro270_y, 0, y_plane_size);
+ memset(ro270_u, 0, uv_plane_size);
+ memset(ro270_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_270 = b * (yh + (2 * b)) + b;
- int uv_off_270 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_270 = b * (yh + b * 2) + b;
+ int uv_off_270 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_270 = yh + (2 * b);
- int uv_st_270 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_270 = yh + b * 2;
+ int uv_st_270 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1129,32 +1133,32 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) {
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro270_y);
- free(ro270_u);
- free(ro270_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro270_y)
+ free_aligned_buffer_16(ro270_u)
+ free_aligned_buffer_16(ro270_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420Rotate180) {
int err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *ro0_y, *ro0_u, *ro0_v;
- uint8 *ro180_y, *ro180_u, *ro180_v;
int yw = 1024;
int yh = 768;
@@ -1164,43 +1168,49 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) {
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- ro180_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- ro180_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- ro180_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(ro0_y, y_plane_size)
+ align_buffer_16(ro0_u, uv_plane_size)
+ align_buffer_16(ro0_v, uv_plane_size)
+ align_buffer_16(ro180_y, y_plane_size)
+ align_buffer_16(ro180_u, uv_plane_size)
+ align_buffer_16(ro180_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(ro0_y, 0, y_plane_size);
+ memset(ro0_u, 0, uv_plane_size);
+ memset(ro0_v, 0, uv_plane_size);
+ memset(ro180_y, 0, y_plane_size);
+ memset(ro180_u, 0, uv_plane_size);
+ memset(ro180_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off = b * (yw + (2 * b)) + b;
- int uv_off = b * (uvw + (2 * b)) + b;
+ int y_off = b * (yw + b * 2) + b;
+ int uv_off = b * (uvw + b * 2) + b;
- int y_st = yw + (2 * b);
- int uv_st = uvw + (2 * b);
+ int y_st = yw + b * 2;
+ int uv_st = uvw + b * 2;
NV12ToI420Rotate(orig_y+y_off, y_st,
orig_uv+y_off, y_st,
@@ -1220,40 +1230,40 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) {
kRotate180);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != ro0_y[i])
+ if (orig_y[i] != ro0_y[i]) {
++err;
+ }
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)ro0_u[i] != -(signed char)ro0_v[i])
+ if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) {
++err;
- if (ro0_u[i] != 0)
+ }
+ if (ro0_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++err;
+ }
- free(orig_y);
- free(orig_uv);
- free(ro0_y);
- free(ro0_u);
- free(ro0_v);
- free(ro180_y);
- free(ro180_u);
- free(ro180_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(ro0_y)
+ free_aligned_buffer_16(ro0_u)
+ free_aligned_buffer_16(ro0_v)
+ free_aligned_buffer_16(ro180_y)
+ free_aligned_buffer_16(ro180_u)
+ free_aligned_buffer_16(ro180_v)
EXPECT_EQ(0, err);
}
TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
int y_err = 0, uv_err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *roa_y, *roa_u, *roa_v;
- uint8 *rob_y, *rob_u, *rob_v;
- uint8 *roc_y, *roc_u, *roc_v;
int yw = 1024;
int yh = 768;
@@ -1262,51 +1272,59 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- roc_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roc_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roc_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(roa_y, y_plane_size)
+ align_buffer_16(roa_u, uv_plane_size)
+ align_buffer_16(roa_v, uv_plane_size)
+ align_buffer_16(rob_y, y_plane_size)
+ align_buffer_16(rob_u, uv_plane_size)
+ align_buffer_16(rob_v, uv_plane_size)
+ align_buffer_16(roc_y, y_plane_size)
+ align_buffer_16(roc_u, uv_plane_size)
+ align_buffer_16(roc_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(roa_y, 0, y_plane_size);
+ memset(roa_u, 0, uv_plane_size);
+ memset(roa_v, 0, uv_plane_size);
+ memset(rob_y, 0, y_plane_size);
+ memset(rob_u, 0, uv_plane_size);
+ memset(rob_v, 0, uv_plane_size);
+ memset(roc_y, 0, y_plane_size);
+ memset(roc_u, 0, uv_plane_size);
+ memset(roc_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off_0 = b * (yw + (2 * b)) + b;
- int uv_off_0 = b * (uvw + (2 * b)) + b;
- int y_off_90 = b * (yh + (2 * b)) + b;
- int uv_off_90 = b * (uvh + (2 * b)) + b;
+ int y_off_0 = b * (yw + b * 2) + b;
+ int uv_off_0 = b * (uvw + b * 2) + b;
+ int y_off_90 = b * (yh + b * 2) + b;
+ int uv_off_90 = b * (uvh + b * 2) + b;
- int y_st_0 = yw + (2 * b);
- int uv_st_0 = uvw + (2 * b);
- int y_st_90 = yh + (2 * b);
- int uv_st_90 = uvh + (2 * b);
+ int y_st_0 = yw + b * 2;
+ int uv_st_0 = uvw + b * 2;
+ int y_st_90 = yh + b * 2;
+ int uv_st_90 = uvh + b * 2;
NV12ToI420Rotate(orig_y+y_off_0, y_st_0,
orig_uv+y_off_0, y_st_0,
@@ -1335,73 +1353,74 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) {
kRotate180);
for (i = 0; i < y_plane_size; ++i) {
- if (orig_y[i] != roc_y[i])
+ if (orig_y[i] != roc_y[i]) {
++y_err;
+ }
}
if (y_err) {
printf("input %dx%d \n", yw, yh);
- print_array(orig_y, y_st_0, yh + (2 * b));
+ PrintArray(orig_y, y_st_0, yh + b * 2);
printf("rotate a\n");
- print_array(roa_y, y_st_90, y_st_0);
+ PrintArray(roa_y, y_st_90, y_st_0);
printf("rotate b\n");
- print_array(rob_y, y_st_90, y_st_0);
+ PrintArray(rob_y, y_st_90, y_st_0);
printf("rotate c\n");
- print_array(roc_y, y_st_0, y_st_90);
+ PrintArray(roc_y, y_st_0, y_st_90);
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)roc_u[i] != -(signed char)roc_v[i])
+ if ((signed char)roc_u[i] != -(signed char)roc_v[i]) {
++uv_err;
- if (rob_u[i] != 0)
+ }
+ if (rob_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++uv_err;
+ }
if (uv_err) {
- printf("input %dx%d \n", (2 * uvw), uvh);
- print_array(orig_uv, y_st_0, uvh + (2 * b));
+ printf("input %dx%d \n", uvw * 2, uvh);
+ PrintArray(orig_uv, y_st_0, uvh + b * 2);
printf("rotate a\n");
- print_array(roa_u, uv_st_90, uv_st_0);
- print_array(roa_v, uv_st_90, uv_st_0);
+ PrintArray(roa_u, uv_st_90, uv_st_0);
+ PrintArray(roa_v, uv_st_90, uv_st_0);
printf("rotate b\n");
- print_array(rob_u, uv_st_90, uv_st_0);
- print_array(rob_v, uv_st_90, uv_st_0);
+ PrintArray(rob_u, uv_st_90, uv_st_0);
+ PrintArray(rob_v, uv_st_90, uv_st_0);
printf("rotate c\n");
- print_array(roc_u, uv_st_0, uv_st_90);
- print_array(roc_v, uv_st_0, uv_st_90);
+ PrintArray(roc_u, uv_st_0, uv_st_90);
+ PrintArray(roc_v, uv_st_0, uv_st_90);
}
- free(orig_y);
- free(orig_uv);
- free(roa_y);
- free(roa_u);
- free(roa_v);
- free(rob_y);
- free(rob_u);
- free(rob_v);
- free(roc_y);
- free(roc_u);
- free(roc_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(roa_y)
+ free_aligned_buffer_16(roa_u)
+ free_aligned_buffer_16(roa_v)
+ free_aligned_buffer_16(rob_y)
+ free_aligned_buffer_16(rob_u)
+ free_aligned_buffer_16(rob_v)
+ free_aligned_buffer_16(roc_y)
+ free_aligned_buffer_16(roc_u)
+ free_aligned_buffer_16(roc_v)
EXPECT_EQ(0, y_err + uv_err);
}
TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
int y_err = 0, uv_err = 0;
- uint8 *orig_y, *orig_uv;
- uint8 *roa_y, *roa_u, *roa_v;
- uint8 *rob_y, *rob_u, *rob_v;
int yw = 1024;
int yh = 768;
@@ -1410,43 +1429,49 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
int uvh = (yh + 1) >> 1;
int i, j;
- int y_plane_size = (yw + (2 * b)) * (yh + (2 * b));
- int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b));
- int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b));
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ int uv_plane_size = (uvw + b * 2) * (uvh + b * 2);
+ int o_uv_plane_size = (uvw * 2 + b * 2) * (uvh + b * 2);
srandom(time(NULL));
- orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8)));
-
- roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
-
- rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8)));
- rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
- rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8)));
+ align_buffer_16(orig_y, y_plane_size)
+ align_buffer_16(orig_uv, o_uv_plane_size)
+ align_buffer_16(roa_y, y_plane_size)
+ align_buffer_16(roa_u, uv_plane_size)
+ align_buffer_16(roa_v, uv_plane_size)
+ align_buffer_16(rob_y, y_plane_size)
+ align_buffer_16(rob_u, uv_plane_size)
+ align_buffer_16(rob_v, uv_plane_size)
+ memset(orig_y, 0, y_plane_size);
+ memset(orig_uv, 0, o_uv_plane_size);
+ memset(roa_y, 0, y_plane_size);
+ memset(roa_u, 0, uv_plane_size);
+ memset(roa_v, 0, uv_plane_size);
+ memset(rob_y, 0, y_plane_size);
+ memset(rob_u, 0, uv_plane_size);
+ memset(rob_v, 0, uv_plane_size);
// fill image buffers with random data
for (i = b; i < (yh + b); ++i) {
for (j = b; j < (yw + b); ++j) {
- orig_y[i * (yw + (2 * b)) + j] = random() & 0xff;
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
}
}
for (i = b; i < (uvh + b); ++i) {
- for (j = b; j < ((2 * uvw) + b); j += 2) {
+ for (j = b; j < (uvw * 2 + b); j += 2) {
uint8 random_number = random() & 0x7f;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number;
- orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j] = random_number;
+ orig_uv[i * (uvw * 2 + b * 2) + j + 1] = -random_number;
}
}
- int y_off = b * (yw + (2 * b)) + b;
- int uv_off = b * (uvw + (2 * b)) + b;
+ int y_off = b * (yw + b * 2) + b;
+ int uv_off = b * (uvw + b * 2) + b;
- int y_st = yw + (2 * b);
- int uv_st = uvw + (2 * b);
+ int y_st = yw + b * 2;
+ int uv_st = uvw + b * 2;
NV12ToI420Rotate(orig_y+y_off, y_st,
orig_uv+y_off, y_st,
@@ -1472,48 +1497,53 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) {
if (y_err) {
printf("input %dx%d \n", yw, yh);
- print_array(orig_y, y_st, yh + (2 * b));
+ PrintArray(orig_y, y_st, yh + b * 2);
printf("rotate a\n");
- print_array(roa_y, y_st, yh + (2 * b));
+ PrintArray(roa_y, y_st, yh + b * 2);
printf("rotate b\n");
- print_array(rob_y, y_st, yh + (2 * b));
+ PrintArray(rob_y, y_st, yh + b * 2);
}
int zero_cnt = 0;
for (i = 0; i < uv_plane_size; ++i) {
- if ((signed char)rob_u[i] != -(signed char)rob_v[i])
+ if ((signed char)rob_u[i] != -(signed char)rob_v[i]) {
++uv_err;
- if (rob_u[i] != 0)
+ }
+ if (rob_u[i] != 0) {
++zero_cnt;
+ }
}
- if (!zero_cnt)
+ if (!zero_cnt) {
++uv_err;
+ }
if (uv_err) {
- printf("input %dx%d \n", (2 * uvw), uvh);
- print_array(orig_uv, y_st, uvh + (2 * b));
+ printf("input %dx%d \n", uvw * 2, uvh);
+ PrintArray(orig_uv, y_st, uvh + b * 2);
printf("rotate a\n");
- print_array(roa_u, uv_st, uvh + (2 * b));
- print_array(roa_v, uv_st, uvh + (2 * b));
+ PrintArray(roa_u, uv_st, uvh + b * 2);
+ PrintArray(roa_v, uv_st, uvh + b * 2);
printf("rotate b\n");
- print_array(rob_u, uv_st, uvh + (2 * b));
- print_array(rob_v, uv_st, uvh + (2 * b));
+ PrintArray(rob_u, uv_st, uvh + b * 2);
+ PrintArray(rob_v, uv_st, uvh + b * 2);
}
- free(orig_y);
- free(orig_uv);
- free(roa_y);
- free(roa_u);
- free(roa_v);
- free(rob_y);
- free(rob_u);
- free(rob_v);
+ free_aligned_buffer_16(orig_y)
+ free_aligned_buffer_16(orig_uv)
+ free_aligned_buffer_16(roa_y)
+ free_aligned_buffer_16(roa_u)
+ free_aligned_buffer_16(roa_v)
+ free_aligned_buffer_16(rob_y)
+ free_aligned_buffer_16(rob_u)
+ free_aligned_buffer_16(rob_v)
EXPECT_EQ(0, y_err + uv_err);
}
+
+} // namespace libyuv
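
The NV12 rotate tests above hinge on one invariant: the interleaved UV source is filled with byte pairs (n, -n), so after the NV12-to-I420 split every U byte must be the two's-complement negation of the matching V byte, and a nonzero count guards against an all-zero output passing trivially. A minimal standalone sketch of that check follows; VerifyNegatedPlanes is a hypothetical helper, not part of this commit.

// Sketch of the U/-V consistency check used by the NV12 rotate tests above.
// VerifyNegatedPlanes is illustrative only.
#include <cstdint>
#include <cstdio>

static int VerifyNegatedPlanes(const uint8_t* u, const uint8_t* v, int size) {
  int err = 0;
  int zero_cnt = 0;
  for (int i = 0; i < size; ++i) {
    // Each U byte must equal the negation of the matching V byte.
    if (static_cast<signed char>(u[i]) != -static_cast<signed char>(v[i])) {
      ++err;
    }
    if (u[i] != 0) {
      ++zero_cnt;
    }
  }
  // All-zero planes would satisfy the negation check trivially, so require
  // at least one nonzero byte, as the tests above do.
  if (!zero_cnt) {
    ++err;
  }
  return err;
}

int main() {
  uint8_t u[4] = {1, 2, 3, 0};
  uint8_t v[4] = {0xFF, 0xFE, 0xFD, 0};  // -1, -2, -3, 0 as signed chars
  printf("errors: %d\n", VerifyNegatedPlanes(u, v, 4));  // prints 0
  return 0;
}
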
diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc
new file mode 100644
index 00000000..fef96764
--- /dev/null
+++ b/files/unit_test/scale_argb_test.cc
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static int ARGBTestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int benchmark_iterations) {
+ const int b = 128;
+ int src_argb_plane_size = (src_width + b * 2) * (src_height + b * 2) * 4;
+ int src_stride_argb = (b * 2 + src_width) * 4;
+
+ align_buffer_16(src_argb, src_argb_plane_size)
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ srandom(time(NULL));
+
+ int i, j;
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
+ }
+ }
+
+ align_buffer_16(dst_argb_c, dst_argb_plane_size)
+ align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ c_time = (get_time() - c_time) / benchmark_iterations;
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_16(dst_argb_c)
+ free_aligned_buffer_16(dst_argb_opt)
+ free_aligned_buffer_16(src_argb)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy2) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 2;
+ const int dst_height = src_height / 2;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy4) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 4;
+ const int dst_height = src_height / 4;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy5) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 5;
+ const int dst_height = src_height / 5;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy8) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 8;
+ const int dst_height = src_height / 8;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy16) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 16;
+ const int dst_height = src_height / 16;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy34) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width * 3 / 4;
+ const int dst_height = src_height * 3 / 4;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy38) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = src_width * 3 / 8;
+ int dst_height = src_height * 3 / 8;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleTo1366) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 1366;
+ int dst_height = 768;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ARGBScaleTo4074) {
+ int src_width = 2880 * 2;
+ int src_height = 1800;
+ int dst_width = 4074;
+ int dst_height = 1272;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+
+TEST_F(libyuvTest, ARGBScaleTo853) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
+
+ for (int f = 0; f < 2; ++f) {
+ int max_diff = ARGBTestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f),
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+} // namespace libyuv
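
A note on the pointer arithmetic repeated throughout ARGBTestFilter above: with a border of b pixels and 4 bytes per ARGB pixel, the row stride is (b * 2 + width) * 4 bytes and the first interior pixel sits stride * b + b * 4 bytes into the buffer, which is exactly the src_argb + (src_stride_argb * b) + b * 4 expression passed to ARGBScale. A small illustrative sketch with the 1280-wide test geometry (not library code):

// Worked example of the ARGB border/stride offsets used in ARGBTestFilter.
#include <cstdio>

int main() {
  const int b = 128;        // border, in pixels, as in the tests above
  const int width = 1280;   // source width used by the ARGBScaleDownBy* tests
  const int stride = (b * 2 + width) * 4;   // bytes per row, 4 bytes per pixel
  const int interior = stride * b + b * 4;  // skip b border rows, then b pixels
  printf("stride = %d bytes, first interior pixel at byte %d\n",
         stride, interior);
  return 0;
}
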
diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc
index e147d78b..55b4148d 100644
--- a/files/unit_test/scale_test.cc
+++ b/files/unit_test/scale_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,152 +8,369 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/scale.h"
-#include "unit_test.h"
#include <stdlib.h>
#include <time.h>
-using namespace libyuv;
-
-#define align_buffer_16(var, size) \
- uint8 *var; \
- uint8 *var##_mem; \
- var##_mem = reinterpret_cast<uint8*>(calloc(size+15, sizeof(uint8))); \
- var = reinterpret_cast<uint8*> \
- ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f));
-
-#define free_aligned_buffer_16(var) \
- free(var##_mem); \
- var = 0;
-
-TEST_F(libyuvTest, ScaleDownBy4) {
- int b = 128;
- int src_width = 1280;
- int src_height = 720;
- int src_width_uv = (src_width + 1) >> 1;
- int src_height_uv = (src_height + 1) >> 1;
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "../unit_test/unit_test.h"
- int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
- int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
+namespace libyuv {
- int src_stride_y = 2 * b + src_width;
- int src_stride_uv = 2 * b + src_width_uv;
+static int TestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int rounding, int benchmark_iterations) {
+ const int b = 128 * rounding;
+ int src_width_uv = (src_width + rounding) >> 1;
+ int src_height_uv = (src_height + rounding) >> 1;
- align_buffer_16(src_y, src_y_plane_size)
- align_buffer_16(src_u, src_uv_plane_size)
- align_buffer_16(src_v, src_uv_plane_size)
+ int src_y_plane_size = (src_width + b * 2) * (src_height + b * 2);
+ int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
- int dst_width = src_width >> 2;
- int dst_height = src_height >> 2;
+ int src_stride_y = b * 2 + src_width;
+ int src_stride_uv = b * 2 + src_width_uv;
- int dst_width_uv = (dst_width + 1) >> 1;
- int dst_height_uv = (dst_height + 1) >> 1;
+ align_buffer_page_end(src_y, src_y_plane_size)
+ align_buffer_page_end(src_u, src_uv_plane_size)
+ align_buffer_page_end(src_v, src_uv_plane_size)
- int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
- int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
+ int dst_width_uv = (dst_width + rounding) >> 1;
+ int dst_height_uv = (dst_height + rounding) >> 1;
- int dst_stride_y = 2 * b + dst_width;
- int dst_stride_uv = 2 * b + dst_width_uv;
+ int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+ int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
- align_buffer_16(dst_y, dst_y_plane_size)
- align_buffer_16(dst_u, dst_uv_plane_size)
- align_buffer_16(dst_v, dst_uv_plane_size)
+ int dst_stride_y = b * 2 + dst_width;
+ int dst_stride_uv = b * 2 + dst_width_uv;
- // create an image with random data reoccurring in 4x4 grid. When the image
- // is filtered all the values should be the same.
srandom(time(NULL));
- uint8 block_data[16];
-
int i, j;
-
- // Pulling 16 random numbers there is an infinitesimally small
- // chance that they are all 0. Then the output will be all 0.
- // Output buffer is filled with 0, want to make sure that after the
- // filtering something went into the output buffer.
- // Avoid this by setting one of the values to 128. Also set the
- // random data to at least 1 for when point sampling to prevent
- // output all being 0.
- block_data[0] = 128;
-
- for (i = 1; i < 16; i++)
- block_data[i] = (random() & 0xfe) + 1;
-
- for (i = b; i < (src_height + b); i += 4) {
- for (j = b; j < (src_width + b); j += 4) {
- uint8 *ptr = src_y + (i * src_stride_y) + j;
- int k, l;
- for (k = 0; k < 4; ++k)
- for (l = 0; l < 4; ++l)
- ptr[k + src_stride_y * l] = block_data[k + 4 * l];
+ for (i = b; i < (src_height + b); ++i) {
+ for (j = b; j < (src_width + b); ++j) {
+ src_y[(i * src_stride_y) + j] = (random() & 0xff);
}
}
- for (i = 1; i < 16; i++)
- block_data[i] = (random() & 0xfe) + 1;
-
- for (i = b; i < (src_height_uv + b); i += 4) {
- for (j = b; j < (src_width_uv + b); j += 4) {
- uint8 *ptru = src_u + (i * src_stride_uv) + j;
- uint8 *ptrv = src_v + (i * src_stride_uv) + j;
- int k, l;
- for (k = 0; k < 4; ++k)
- for (l = 0; l < 4; ++l) {
- ptru[k + src_stride_uv * l] = block_data[k + 4 * l];
- ptrv[k + src_stride_uv * l] = block_data[k + 4 * l];
- }
+ for (i = b; i < (src_height_uv + b); ++i) {
+ for (j = b; j < (src_width_uv + b); ++j) {
+ src_u[(i * src_stride_uv) + j] = (random() & 0xff);
+ src_v[(i * src_stride_uv) + j] = (random() & 0xff);
}
}
- int f;
- int err = 0;
+ align_buffer_page_end(dst_y_c, dst_y_plane_size)
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
- // currently three filter modes, defined as FilterMode in scale.h
- for (f = 0; f < 3; ++f) {
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
src_u + (src_stride_uv * b) + b, src_stride_uv,
src_v + (src_stride_uv * b) + b, src_stride_uv,
src_width, src_height,
- dst_y + (dst_stride_y * b) + b, dst_stride_y,
- dst_u + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_width, dst_height,
- static_cast<FilterMode>(f));
-
- int value = dst_y[(dst_stride_y * b) + b];
-
- // catch the case that the output buffer is all 0
- if (value == 0)
- ++err;
-
- for (i = b; i < (dst_height + b); ++i) {
- for (j = b; j < (dst_width + b); ++j) {
- if (value != dst_y[(i * dst_stride_y) + j])
- ++err;
- }
- }
+ dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ }
+ c_time = (get_time() - c_time) / benchmark_iterations;
- value = dst_u[(dst_stride_uv * b) + b];
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
- if (value == 0)
- ++err;
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference isn't
+ // over 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b; j < (dst_width + b); ++j) {
+ int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
- for (i = b; i < (dst_height_uv + b); ++i) {
- for (j = b; j < (dst_width_uv + b); ++j) {
- if (value != dst_u[(i * dst_stride_uv) + j])
- ++err;
- if (value != dst_v[(i * dst_stride_uv) + j])
- ++err;
+ for (i = b; i < (dst_height_uv + b); ++i) {
+ for (j = b; j < (dst_width_uv + b); ++j) {
+ int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
}
}
}
- free_aligned_buffer_16(src_y)
- free_aligned_buffer_16(src_u)
- free_aligned_buffer_16(src_v)
- free_aligned_buffer_16(dst_y)
- free_aligned_buffer_16(dst_u)
- free_aligned_buffer_16(dst_v)
+ free_aligned_buffer_page_end(dst_y_c)
+ free_aligned_buffer_page_end(dst_u_c)
+ free_aligned_buffer_page_end(dst_v_c)
+ free_aligned_buffer_page_end(dst_y_opt)
+ free_aligned_buffer_page_end(dst_u_opt)
+ free_aligned_buffer_page_end(dst_v_opt)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ScaleDownBy2) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 2;
+ const int dst_height = src_height / 2;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy4) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 4;
+ const int dst_height = src_height / 4;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 2); // This is the only scale factor with error of 2.
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy5) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 5;
+ const int dst_height = src_height / 5;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy8) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 8;
+ const int dst_height = src_height / 8;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy16) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width / 16;
+ const int dst_height = src_height / 16;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy34) {
+ const int src_width = 1280;
+ const int src_height = 720;
+ const int dst_width = src_width * 3 / 4;
+ const int dst_height = src_height * 3 / 4;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleDownBy38) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = src_width * 3 / 8;
+ int dst_height = src_height * 3 / 8;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo1366) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 1366;
+ int dst_height = 768;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo4074) {
+ int src_width = 2880 * 2;
+ int src_height = 1800;
+ int dst_width = 4074;
+ int dst_height = 1272;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo853) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo853Wrong) {
+ int src_width = 1280;
+ int src_height = 720;
+ int dst_width = 853;
+ int dst_height = 480;
- EXPECT_EQ(0, err);
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 0,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
}
+
+// A one-off test for a screencast resolution scale.
+TEST_F(libyuvTest, ScaleTo684) {
+ int src_width = 686;
+ int src_height = 557;
+ int dst_width = 684;
+ int dst_height = 552;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleTo342) {
+ int src_width = 686;
+ int src_height = 557;
+ int dst_width = 342;
+ int dst_height = 276;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+TEST_F(libyuvTest, ScaleToHalf342) {
+ int src_width = 684;
+ int src_height = 552;
+ int dst_width = 342;
+ int dst_height = 276;
+
+ for (int f = 0; f < 3; ++f) {
+ int max_diff = TestFilter(src_width, src_height,
+ dst_width, dst_height,
+ static_cast<FilterMode>(f), 1,
+ benchmark_iterations_);
+ EXPECT_LE(max_diff, 1);
+ }
+}
+
+} // namespace libyuv
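
The rounding argument to TestFilter is what separates ScaleTo853 from ScaleTo853Wrong above: it scales the border (128 * rounding) and decides whether odd luma dimensions produce a rounded-up ((w + 1) >> 1) or truncated (w >> 1) chroma plane. A short sketch of the resulting geometry for the 853x480 destination, purely illustrative:

// How TestFilter's 'rounding' argument changes the chroma geometry
// for the 853x480 destination used by ScaleTo853 / ScaleTo853Wrong.
#include <cstdio>

int main() {
  const int dst_width = 853;
  const int dst_height = 480;
  for (int rounding = 1; rounding >= 0; --rounding) {
    int b = 128 * rounding;                  // border collapses to 0
    int uvw = (dst_width + rounding) >> 1;   // 427 rounded up vs 426 truncated
    int uvh = (dst_height + rounding) >> 1;  // 240 either way (even height)
    printf("rounding=%d: border=%d, UV plane %dx%d\n", rounding, b, uvw, uvh);
  }
  return 0;
}
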
diff --git a/files/unit_test/testdata/arm_v7.txt b/files/unit_test/testdata/arm_v7.txt
new file mode 100644
index 00000000..5d7dbd04
--- /dev/null
+++ b/files/unit_test/testdata/arm_v7.txt
@@ -0,0 +1,12 @@
+Processor : ARMv7 Processor rev 5 (v7l)
+BogoMIPS : 795.44
+Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16
+CPU implementer : 0x56
+CPU architecture: 7
+CPU variant : 0x0
+CPU part : 0x581
+CPU revision : 5
+
+Hardware : OLPC XO-1.75
+Revision : 0000
+Serial : 0000000000000000
diff --git a/files/unit_test/testdata/tegra3.txt b/files/unit_test/testdata/tegra3.txt
new file mode 100644
index 00000000..d1b09f6b
--- /dev/null
+++ b/files/unit_test/testdata/tegra3.txt
@@ -0,0 +1,23 @@
+Processor : ARMv7 Processor rev 9 (v7l)
+processor : 0
+BogoMIPS : 1992.29
+
+processor : 1
+BogoMIPS : 1992.29
+
+processor : 2
+BogoMIPS : 1992.29
+
+processor : 3
+BogoMIPS : 1992.29
+
+Features : swp half thumb fastmult vfp edsp neon vfpv3
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant : 0x2
+CPU part : 0xc09
+CPU revision : 9
+
+Hardware : cardhu
+Revision : 0000
+
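
These two cpuinfo captures differ in the one flag relevant here: the Tegra 3 Features line lists neon while the XO-1.75 one does not, which is what a CPU-detection test would key on. An illustrative scan over such a capture follows; HasNeonFeature is a hypothetical helper, not the libyuv cpu_id API.

// Illustrative check: scan a captured /proc/cpuinfo "Features" line for NEON.
// HasNeonFeature is a hypothetical helper, not part of libyuv.
#include <cstdio>
#include <cstring>

static bool HasNeonFeature(const char* path) {
  FILE* f = fopen(path, "r");
  if (!f) {
    return false;
  }
  char line[512];
  bool neon = false;
  while (fgets(line, sizeof(line), f)) {
    if (strncmp(line, "Features", 8) == 0 && strstr(line, " neon")) {
      neon = true;
      break;
    }
  }
  fclose(f);
  return neon;
}

int main() {
  printf("tegra3: %d\n", HasNeonFeature("files/unit_test/testdata/tegra3.txt"));
  printf("arm_v7: %d\n", HasNeonFeature("files/unit_test/testdata/arm_v7.txt"));
  return 0;
}
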
diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc
index 1996adf1..007c81f0 100644
--- a/files/unit_test/unit_test.cc
+++ b/files/unit_test/unit_test.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,33 +8,26 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <cstring>
-#include "unit_test.h"
-
-class libyuvEnvironment : public ::testing::Environment {
- public:
- virtual void SetUp() {
- }
+#include "../unit_test/unit_test.h"
- virtual void TearDown() {
- }
-};
-
-libyuvTest::libyuvTest() :
- _rotate_max_w(128),
- _rotate_max_h(128) {
-}
+#include <stdlib.h> // For getenv()
-void libyuvTest::SetUp() {
-}
+#include <cstring>
-void libyuvTest::TearDown() {
+// Change this to 1000 for benchmarking.
+// TODO(fbarchard): Add command line parsing to pass this as an option.
+#define BENCHMARK_ITERATIONS 1
+
+libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
+ benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(1280),
+ benchmark_height_(720) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
- libyuvEnvironment* env = new libyuvEnvironment;
- ::testing::AddGlobalTestEnvironment(env);
-
return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
index cac30c72..62521e88 100644
--- a/files/unit_test/unit_test.h
+++ b/files/unit_test/unit_test.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,20 +8,67 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef UINIT_TEST_H_
-#define UINIT_TEST_H_
+#ifndef UNIT_TEST_UNIT_TEST_H_
+#define UNIT_TEST_UNIT_TEST_H_
#include <gtest/gtest.h>
+#define align_buffer_16(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc((size) + 15)); \
+ var = reinterpret_cast<uint8*> \
+ ((reinterpret_cast<intptr_t>(var##_mem) + 15) & ~15);
+
+#define free_aligned_buffer_16(var) \
+ free(var##_mem); \
+ var = 0;
+
+
+#define align_buffer_page_end(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
+ var = var##_mem + (-(size) & 4095);
+
+#define free_aligned_buffer_page_end(var) \
+ free(var##_mem); \
+ var = 0;
+
+#ifdef WIN32
+#include <windows.h>
+static inline double get_time() {
+ LARGE_INTEGER t, f;
+ QueryPerformanceCounter(&t);
+ QueryPerformanceFrequency(&f);
+ return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
+}
+
+#define random rand
+#define srandom srand
+#else
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+static inline double get_time() {
+ struct timeval t;
+ struct timezone tzp;
+ gettimeofday(&t, &tzp);
+ return t.tv_sec + t.tv_usec * 1e-6;
+}
+#endif
+
class libyuvTest : public ::testing::Test {
protected:
libyuvTest();
- virtual void SetUp();
- virtual void TearDown();
- const int _rotate_max_w;
- const int _rotate_max_h;
+ const int rotate_max_w_;
+ const int rotate_max_h_;
+ int benchmark_iterations_;
+ const int benchmark_width_;
+ const int benchmark_height_;
};
-#endif // UNIT_TEST_H_
+#endif // UNIT_TEST_UNIT_TEST_H_
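
The align_buffer_page_end macro above places the buffer so that its last byte coincides with the end of a page-rounded allocation, which makes reads past the end more likely to fault than with the 16-byte variant. A small sketch of that arithmetic, assuming the macro's 4096-byte page size:

// Sketch of the arithmetic behind align_buffer_page_end (from unit_test.h):
// the buffer is placed so that var + size lands exactly on the end of the
// page-rounded allocation, helping catch reads past the end.
#include <cassert>
#include <cstdio>

int main() {
  const int size = 100;                     // example buffer size
  const int alloc = (size + 4095) & ~4095;  // page-rounded allocation: 4096
  const int offset = (-size) & 4095;        // start offset inside it: 3996
  assert(offset + size == alloc);           // buffer ends on the allocation end
  printf("alloc=%d, offset=%d, end=%d\n", alloc, offset, offset + size);
  return 0;
}
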
diff --git a/files/unit_test/version_test.cc b/files/unit_test/version_test.cc
new file mode 100644
index 00000000..c53d754c
--- /dev/null
+++ b/files/unit_test/version_test.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests SVN version against include/libyuv/version.h
+// SVN version is bumped by documentation changes as well as code.
+// Although the versions should match, once checked in, a tolerance is allowed.
+TEST_F(libyuvTest, TestVersion) {
+ EXPECT_GE(LIBYUV_VERSION, 169); // 169 is first version to support version.
+ printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
+#ifdef LIBYUV_SVNREVISION
+ const char *ver = strchr(LIBYUV_SVNREVISION, ':');
+ if (ver) {
+ ++ver;
+ } else {
+ ver = LIBYUV_SVNREVISION;
+ }
+ int svn_revision = atoi(ver); // NOLINT
+ printf("LIBYUV_SVNREVISION %d\n", svn_revision);
+ EXPECT_NEAR(LIBYUV_VERSION, svn_revision, 3); // Allow version to be close.
+ if (LIBYUV_VERSION != svn_revision) {
+ printf("WARNING - Versions do not match.\n");
+ }
+#endif
+}
+
+} // namespace libyuv
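
For reference, the strchr/atoi sequence in TestVersion above handles an SVN keyword-expanded revision string; the constant below is a hypothetical example value, the real string comes from libyuv/version.h.

// Sketch of the LIBYUV_SVNREVISION parse used in TestVersion, run on a
// hypothetical keyword string rather than the real macro.
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const char* kSvnRevision = "$Revision: 397 $";  // hypothetical example value
  const char* ver = strchr(kSvnRevision, ':');
  if (ver) {
    ++ver;                // skip the ':' so atoi sees " 397 $"
  } else {
    ver = kSvnRevision;   // plain numeric form, e.g. "397"
  }
  int svn_revision = atoi(ver);
  printf("parsed revision %d\n", svn_revision);  // prints 397
  return 0;
}
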
diff --git a/files/util/compare.cc b/files/util/compare.cc
new file mode 100644
index 00000000..f030c799
--- /dev/null
+++ b/files/util/compare.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/version.h"
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+ printf("libyuv compare v%d\n", LIBYUV_VERSION);
+ printf("compare file1.yuv file2.yuv\n");
+ return -1;
+ }
+ char* name1 = argv[1];
+ char* name2 = (argc > 2) ? argv[2] : NULL;
+ FILE* fin1 = fopen(name1, "rb");
+ FILE* fin2 = name2 ? fopen(name2, "rb") : NULL;
+
+ const int kBlockSize = 32768;
+ uint8 buf1[kBlockSize];
+ uint8 buf2[kBlockSize];
+ uint32 hash1 = 5381;
+ uint32 hash2 = 5381;
+ uint64 sum_square_err = 0;
+ uint64 size_min = 0;
+ int amt1 = 0;
+ int amt2 = 0;
+ do {
+ amt1 = fread(buf1, 1, kBlockSize, fin1);
+ if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+ if (fin2) {
+ amt2 = fread(buf2, 1, kBlockSize, fin2);
+ if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+ int amt_min = (amt1 < amt2) ? amt1 : amt2;
+ size_min += amt_min;
+ sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
+ }
+ } while (amt1 > 0 || amt2 > 0);
+
+ printf("hash1 %x", hash1);
+ if (fin2) {
+ printf(", hash2 %x", hash2);
+ double mse = static_cast<double>(sum_square_err) /
+ static_cast<double>(size_min);
+ printf(", mse %.2f", mse);
+ double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
+ printf(", psnr %.2f\n", psnr);
+ fclose(fin2);
+ }
+ fclose(fin1);
+}
+
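
The two numbers this tool prints come from a DJB2-style rolling hash (seeded with 5381, as in the code above, with each byte folded in as hash * 33 + byte) and a PSNR derived from the accumulated squared error over 8-bit samples. A self-contained sketch of both formulas, written for illustration rather than copied from the library:

// Self-contained sketch of the two metrics the compare tool prints:
// a DJB2-style rolling hash and PSNR computed from summed squared error.
// This reimplements the formulas for illustration; it is not libyuv code.
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t HashDjb2Sketch(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // classic DJB2 step
  }
  return hash;
}

static double PsnrSketch(uint64_t sum_square_err, uint64_t count) {
  if (sum_square_err == 0) {
    return 128.0;  // identical buffers: report a capped "max" PSNR (assumption)
  }
  double mse = static_cast<double>(sum_square_err) / static_cast<double>(count);
  return 10.0 * log10(255.0 * 255.0 / mse);
}

int main() {
  const uint8_t a[4] = {10, 20, 30, 40};
  const uint8_t b[4] = {10, 22, 30, 44};
  uint64_t sse = 0;
  for (int i = 0; i < 4; ++i) {
    int d = a[i] - b[i];
    sse += static_cast<uint64_t>(d * d);
  }
  printf("hash %x, psnr %.2f\n", HashDjb2Sketch(a, 4, 5381), PsnrSketch(sse, 4));
  return 0;
}
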