author | Shri Borde <shri@google.com> | 2011-11-02 13:20:24 -0700
---|---|---
committer | Shri Borde <shri@google.com> | 2011-11-02 13:20:24 -0700
commit | 7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa (patch)
tree | b33940212e8eae6d9df454f5461279da919629cf
parent | 2398a6ec900d592b1433dc24eeeecf442794eb10 (diff)
Initial population of libyuv
Change-Id: I46a6a1525aebaba979b0f2ca5b58be2004901410
39 files changed, 13536 insertions, 0 deletions
diff --git a/Android.mk b/Android.mk new file mode 100644 index 00000000..4ad534ae --- /dev/null +++ b/Android.mk @@ -0,0 +1,43 @@ +# This is the Android makefile for google3/third_party/libyuv so that we can +# build it with the Android NDK. +LOCAL_PATH := $(call my-dir) + +common_SRC_FILES := \ + files/source/convert.cc \ + files/source/format_conversion.cc \ + files/source/planar_functions.cc \ + files/source/row_posix.cc \ + files/source/video_common.cc \ + files/source/cpu_id.cc \ + files/source/general.cc \ + files/source/rotate.cc \ + files/source/row_table.cc \ + files/source/scale.cc + +common_CFLAGS := -Wall -fexceptions + +common_C_INCLUDES = $(LOCAL_PATH)/files/include + +# For the device +# ===================================================== +# Device static library + +include $(CLEAR_VARS) + +LOCAL_CPP_EXTENSION := .cc + +ifneq ($(TARGET_ARCH),x86) + LOCAL_NDK_VERSION := 5 + LOCAL_SDK_VERSION := 9 + LOCAL_NDK_STL_VARIANT := stlport_static +endif + + +LOCAL_SRC_FILES := $(common_SRC_FILES) +LOCAL_CFLAGS += $(common_CFLAGS) +LOCAL_C_INCLUDES += $(common_C_INCLUDES) + +LOCAL_MODULE:= libyuv_static +LOCAL_MODULE_TAGS := optional + +include $(BUILD_STATIC_LIBRARY) @@ -0,0 +1,14 @@ +# Copyright 2011 Google Inc. All Rights Reserved. +# +# Description: +# The libyuv package provides an implementation of YUV image conversion and +# scaling. +# +# This library is used by Talk Video and WebRTC. +# + +licenses(['notice']) # 3-clause BSD + +exports_files(['LICENSE']) + +package(default_visibility = ['//visibility:public']) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..da40b336 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2011, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
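For orientation before the headers below: the BUILD description says the package provides YUV image conversion and scaling. A minimal sketch of a caller of the API this commit introduces (not part of the diff; the umbrella header comes from files/include/libyuv.h below, and the buffer sizes and unpadded strides are my assumptions):

```cpp
// Hypothetical caller of the conversion API added by this commit.
#include <vector>
#include "libyuv.h"  // umbrella header, added below in files/include/

int main() {
  const int width = 64, height = 48;
  std::vector<uint8> argb(width * height * 4, 128);  // 4 bytes per ARGB pixel
  std::vector<uint8> y(width * height);
  std::vector<uint8> u((width / 2) * (height / 2));  // 4:2:0 chroma planes
  std::vector<uint8> v((width / 2) * (height / 2));
  // Strides are bytes per row; with unpadded rows the chroma stride is width/2.
  return libyuv::ARGBToI420(&argb[0], width * 4,
                            &y[0], width,
                            &u[0], width / 2,
                            &v[0], width / 2,
                            width, height);
}
```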
@@ -0,0 +1,7 @@ +fbarchard +mikhal +hbono +juberti +thaloun +tschmelcher +whyuan diff --git a/README.google b/README.google new file mode 100644 index 00000000..79828ab8 --- /dev/null +++ b/README.google @@ -0,0 +1,12 @@ +URL: http://libyuv.googlecode.com/svn-history/r52/trunk/ +Version: r52 +License: BSD +License File: LICENSE + +Description: +libyuv is an open-source library for YUV conversion and scaling. +Specifically, libyuv is optimized for SSE2/SSSE3 and NEON and has demonstrated +speedups of 10x to 16x compared to C code. + +Local Modifications: +None diff --git a/files/LICENSE b/files/LICENSE new file mode 100644 index 00000000..da40b336 --- /dev/null +++ b/files/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2011, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/files/PATENTS b/files/PATENTS new file mode 100644 index 00000000..64aa5c90 --- /dev/null +++ b/files/PATENTS @@ -0,0 +1,24 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the LibYuv code package. + +Google hereby grants to you a perpetual, worldwide, non-exclusive, +no-charge, irrevocable (except as stated in this section) patent +license to make, have made, use, offer to sell, sell, import, +transfer, and otherwise run, modify and propagate the contents of this +implementation of the LibYuv code package, where such license applies +only to those patent claims, both currently owned by Google and +acquired in the future, licensable by Google that are necessarily +infringed by this implementation of the LibYuv code package. This +grant does not include claims that would be infringed only as a +consequence of further modification of this implementation.
If you or +your agent or exclusive licensee institute or order or agree to the +institution of patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that this +implementation of the LibYuv code package or any code incorporated +within this implementation of the LibYuv code package constitutes +direct or contributory patent infringement, or inducement of patent +infringement, then any patent rights granted to you under this License +for this implementation of the LibYuv code package shall terminate as +of the date such litigation is filed.
\ No newline at end of file diff --git a/files/include/libyuv.h b/files/include/libyuv.h new file mode 100644 index 00000000..5a30e2d0 --- /dev/null +++ b/files/include/libyuv.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef LIBYUV_INCLUDE_LIBYUV_H_ +#define LIBYUV_INCLUDE_LIBYUV_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/general.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" + +#endif // LIBYUV_INCLUDE_LIBYUV_H_ diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h new file mode 100644 index 00000000..5adc2bfd --- /dev/null +++ b/files/include/libyuv/basic_types.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#ifndef WIN32 +#include <stdint.h> // for uintptr_t +#endif + +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef __int64 int64; +#else +typedef long long int64; +#endif /* COMPILER_MSVC */ +typedef int int32; +typedef short int16; +typedef char int8; + +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else +typedef unsigned long long uint64; +typedef long long int64; +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif /* COMPILER_MSVC */ +typedef unsigned int uint32; +typedef unsigned short uint16; +typedef unsigned char uint8; +#endif // INT_TYPES_DEFINED + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif + +#define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1))) +#define ALIGNP(p, t) \ + (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ + ((t)-1)) & ~((t)-1)))) + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h new file mode 100644 index 00000000..fa3b6446 --- /dev/null +++ b/files/include/libyuv/convert.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h new file mode 100644 index 00000000..c1000e86 --- /dev/null +++ b/files/include/libyuv/cpu_id.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
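As a reading aid for convert.h above: every converter takes per-plane pointers and per-plane strides. A sketch (the helper name is mine; it assumes one contiguous, unpadded I420 buffer) of deriving the three plane pointers for a width x height frame:

```cpp
// Hypothetical helper, assuming stride == width (no row padding).
#include "libyuv/convert.h"

int ContiguousI420ToRGB24(const uint8* i420, uint8* rgb24,
                          int width, int height) {
  const int y_size = width * height;
  const int uv_size = (width / 2) * (height / 2);  // 4:2:0 subsampling
  const uint8* y = i420;
  const uint8* u = y + y_size;   // U plane follows the Y plane
  const uint8* v = u + uv_size;  // V plane follows the U plane
  return libyuv::I420ToRGB24(y, width,
                             u, width / 2,
                             v, width / 2,
                             rgb24, width * 3,  // 3 bytes per RGB24 pixel
                             width, height);
}
```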
+ */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +namespace libyuv { + +// These flags are only valid on x86 processors +static const int kCpuHasSSE2 = 1; +static const int kCpuHasSSSE3 = 2; + +// These flags are only valid on ARM processors +static const int kCpuHasNEON = 4; + +// Internal flag to indicate cpuid is initialized. +static const int kCpuInitialized = 8; + +// Detect CPU has SSE2 etc. +bool TestCpuFlag(int flag); + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all. +void MaskCpuFlags(int enable_flags); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/files/include/libyuv/format_conversion.h b/files/include/libyuv/format_conversion.h new file mode 100644 index 00000000..d3d36f38 --- /dev/null +++ b/files/include/libyuv/format_conversion.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ +#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Converts any Bayer RGB format to I420. +int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Converts any Bayer RGB format to ARGB. +int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Converts ARGB to any Bayer RGB format. +int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, + uint8* dst_bayer, int dst_stride_bayer, + uint32 dst_fourcc_bayer, + int width, int height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ diff --git a/files/include/libyuv/general.h b/files/include/libyuv/general.h new file mode 100644 index 00000000..58943c86 --- /dev/null +++ b/files/include/libyuv/general.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * General operations on YUV images. + */ + +#ifndef INCLUDE_LIBYUV_GENERAL_H_ +#define INCLUDE_LIBYUV_GENERAL_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// I420 mirror +int +I420Mirror(const uint8* src_yplane, int src_ystride, + const uint8* src_uplane, int src_ustride, + const uint8* src_vplane, int src_vstride, + uint8* dst_yplane, int dst_ystride, + uint8* dst_uplane, int dst_ustride, + uint8* dst_vplane, int dst_vstride, + int width, int height); + +// Crop/Pad I420 frame to match required dimensions. 
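The flags in cpu_id.h above drive the runtime dispatch used by the source files later in this commit (see the TestCpuFlag(kCpuHasSSSE3) checks in convert.cc). A sketch of probing and masking flags, using only the API shown above (the printout is mine):

```cpp
#include <cstdio>
#include "libyuv/cpu_id.h"

int main() {
  // Probe which optimized paths the dispatch would pick on this machine.
  std::printf("SSE2:%d SSSE3:%d NEON:%d\n",
              libyuv::TestCpuFlag(libyuv::kCpuHasSSE2),
              libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3),
              libyuv::TestCpuFlag(libyuv::kCpuHasNEON));
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);  // force non-SSSE3 paths
  // ... run a conversion here to compare against the optimized result ...
  libyuv::MaskCpuFlags(-1);                     // re-enable everything
  return 0;
}
```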
+int +I420CropPad(const uint8* src_frame, int src_width, + int src_height, uint8* dst_frame, + int dst_width, int dst_height); + +// I420 Crop - crop a rectangle from image +int +I420Crop(uint8* frame, + int src_width, int src_height, + int dst_width, int dst_height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_GENERAL_H_ diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h new file mode 100644 index 00000000..9c0a10a3 --- /dev/null +++ b/files/include/libyuv/planar_functions.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Copy I420 to I420. +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Draw a rectangle into I420 +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v); + +// Convert I422 to I420. Used by MJPG. +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. Also used for NV21. +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. Deprecated. +int NV12ToI420(const uint8* src_y, + const uint8* src_uv, int src_stride, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert Q420 to I420. +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I420 to ARGB. 
+int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to BGRA. +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ABGR. +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 to ARGB. +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 to ARGB. Reverse of ARGBToI400 +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert RAW to ARGB. +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert BG24 to ARGB. +int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ABGR to ARGB. Also used for ARGB to ABGR. +int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert BGRA to ARGB. Also used for ARGB to BGRA. +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB to I400. +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h new file mode 100644 index 00000000..65c38de3 --- /dev/null +++ b/files/include/libyuv/rotate.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
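Among the planar functions above, NV12ToI420 is the usual first step off camera output. A minimal sketch of de-interleaving one NV12 frame into I420 (the contiguous layout and unpadded strides are my assumptions):

```cpp
#include "libyuv/planar_functions.h"

// Hypothetical wrapper; NV12 is a Y plane followed by interleaved UV rows.
int Nv12FrameToI420(const uint8* nv12, int width, int height,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const uint8* src_y = nv12;
  const uint8* src_uv = nv12 + width * height;  // UV plane follows Y plane
  return libyuv::NV12ToI420(src_y, width,       // 12-argument overload, not
                            src_uv, width,      // the deprecated 11-arg one
                            dst_y, width,
                            dst_u, width / 2,
                            dst_v, width / 2,
                            width, height);
}
```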
+ */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Supported rotation +enum RotationMode { + kRotate0 = 0, // No rotation + kRotate90 = 90, // Rotate 90 degrees clockwise + kRotate180 = 180, // Rotate 180 degrees + kRotate270 = 270, // Rotate 270 degrees clockwise + + // Deprecated + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +}; + +// Rotate I420 frame +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + RotationMode mode); + +// Rotate NV12 input and store in I420 +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + RotationMode mode); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h new file mode 100644 index 00000000..8433908b --- /dev/null +++ b/files/include/libyuv/scale.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +// Supported filtering +enum FilterMode { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. + kFilterBox = 2 // Highest quality +}; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce an even better +// quality image, at further expense of speed. +// Returns 0 if successful. + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API +// If dst_height_offset is non-zero, the image is offset by that many pixels +// and stretched to (dst_height - dst_height_offset * 2) pixels high, +// instead of dst_height. +int Scale(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_height_offset, + bool interpolate); + +// Same, but src is specified in terms of each plane's location and stride.
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + bool interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(bool use); + +} // namespace libyuv + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/files/libyuv.gyp b/files/libyuv.gyp new file mode 100644 index 00000000..d5abab73 --- /dev/null +++ b/files/libyuv.gyp @@ -0,0 +1,70 @@ +# Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +{ + 'targets': [ + { + 'target_name': 'libyuv', + 'type': 'static_library', + 'include_dirs': [ + 'common', + 'include', + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'common', + 'include', + ], + }, + 'sources': [ + # includes + 'include/convert.h', + 'include/general.h', + 'include/scale.h', + 'include/planar_functions.h', + + # headers + 'common/basic_types.h', + 'common/common.h', + 'common/constructor_magic.h', + 'source/cpu_id.h', + 'source/rotate.h', + 'source/row.h', + 'source/video_common.h', + + # sources + 'source/convert.cc', + 'source/cpu_id.cc', + 'source/format_conversion.cc', + 'source/general.cc', + 'source/planar_functions.cc', + 'source/rotate.cc', + 'source/row_table.cc', + 'source/scale.cc', + 'source/video_common.cc', + ], + 'conditions': [ + ['OS=="win"', { + 'sources': [ + 'source/row_win.cc', + ], + },{ # else + 'sources': [ + 'source/row_posix.cc', + ], + }], + ] + }, + ], # targets +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/files/source/conversion_tables.h b/files/source/conversion_tables.h new file mode 100644 index 00000000..9a328649 --- /dev/null +++ b/files/source/conversion_tables.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/************************************************************** +* conversion_tables.h +* +* Pre-compiled definitions of the conversion equations: YUV -> RGB.
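scale.h above documents the three filter modes; kFilterBox trades the most speed for quality when scaling down. A sketch of a box-filtered 2x downscale (the wrapper name, even dimensions, and unpadded strides are assumptions):

```cpp
#include "libyuv/scale.h"

// Hypothetical 2x downscale of an I420 frame using the box filter.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              int src_width, int src_height,
              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_width / 2;
  const int dst_height = src_height / 2;
  return libyuv::I420Scale(src_y, src_width,
                           src_u, src_width / 2,
                           src_v, src_width / 2,
                           src_width, src_height,
                           dst_y, dst_width,
                           dst_u, dst_width / 2,
                           dst_v, dst_width / 2,
                           dst_width, dst_height,
                           libyuv::kFilterBox);
}
```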
+* +***************************************************************/ + +#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_ +#define LIBYUV_SOURCE_CONVERSION_TABLES_H_ + +namespace libyuv { + +/****************************************************************************** +* YUV TO RGB approximation +* +* R = clip( (298 * (Y - 16) + 409 * (V - 128) + 128 ) >> 8 ) +* G = clip( (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128 ) >> 8 ) +* B = clip( (298 * (Y - 16) + 516 * (U - 128) + 128 ) >> 8 ) +*******************************************************************************/ + + #define Yc(i) static_cast<int> ( 298 * ( i - 16 )) // Y contribution + #define Ucg(i) static_cast<int> ( -100 * ( i - 128 ))// U contribution to G + #define Ucb(i) static_cast<int> ( 516 * ( i - 128 ))// U contribution to B + #define Vcr(i) static_cast<int> ( 409 * ( i - 128 ))// V contribution to R + #define Vcg(i) static_cast<int> ( -208 * ( i - 128 ))// V contribution to G + + static const int mapYc[256] = { + Yc(0),Yc(1),Yc(2),Yc(3),Yc(4),Yc(5),Yc(6),Yc(7),Yc(8),Yc(9), + Yc(10),Yc(11),Yc(12),Yc(13),Yc(14),Yc(15),Yc(16),Yc(17),Yc(18),Yc(19), + Yc(20),Yc(21),Yc(22),Yc(23),Yc(24),Yc(25),Yc(26),Yc(27),Yc(28),Yc(29), + Yc(30),Yc(31),Yc(32),Yc(33),Yc(34),Yc(35),Yc(36),Yc(37),Yc(38),Yc(39), + Yc(40),Yc(41),Yc(42),Yc(43),Yc(44),Yc(45),Yc(46),Yc(47),Yc(48),Yc(49), + Yc(50),Yc(51),Yc(52),Yc(53),Yc(54),Yc(55),Yc(56),Yc(57),Yc(58),Yc(59), + Yc(60),Yc(61),Yc(62),Yc(63),Yc(64),Yc(65),Yc(66),Yc(67),Yc(68),Yc(69), + Yc(70),Yc(71),Yc(72),Yc(73),Yc(74),Yc(75),Yc(76),Yc(77),Yc(78),Yc(79), + Yc(80),Yc(81),Yc(82),Yc(83),Yc(84),Yc(85),Yc(86),Yc(87),Yc(88),Yc(89), + Yc(90),Yc(91),Yc(92),Yc(93),Yc(94),Yc(95),Yc(96),Yc(97),Yc(98),Yc(99), + Yc(100),Yc(101),Yc(102),Yc(103),Yc(104),Yc(105),Yc(106),Yc(107),Yc(108), + Yc(109),Yc(110),Yc(111),Yc(112),Yc(113),Yc(114),Yc(115),Yc(116),Yc(117), + Yc(118),Yc(119),Yc(120),Yc(121),Yc(122),Yc(123),Yc(124),Yc(125),Yc(126), + Yc(127),Yc(128),Yc(129),Yc(130),Yc(131),Yc(132),Yc(133),Yc(134),Yc(135), + Yc(136),Yc(137),Yc(138),Yc(139),Yc(140),Yc(141),Yc(142),Yc(143),Yc(144), + Yc(145),Yc(146),Yc(147),Yc(148),Yc(149),Yc(150),Yc(151),Yc(152),Yc(153), + Yc(154),Yc(155),Yc(156),Yc(157),Yc(158),Yc(159),Yc(160),Yc(161),Yc(162), + Yc(163),Yc(164),Yc(165),Yc(166),Yc(167),Yc(168),Yc(169),Yc(170),Yc(171), + Yc(172),Yc(173),Yc(174),Yc(175),Yc(176),Yc(177),Yc(178),Yc(179),Yc(180), + Yc(181),Yc(182),Yc(183),Yc(184),Yc(185),Yc(186),Yc(187),Yc(188),Yc(189), + Yc(190),Yc(191),Yc(192),Yc(193),Yc(194),Yc(195),Yc(196),Yc(197),Yc(198), + Yc(199),Yc(200),Yc(201),Yc(202),Yc(203),Yc(204),Yc(205),Yc(206),Yc(207), + Yc(208),Yc(209),Yc(210),Yc(211),Yc(212),Yc(213),Yc(214),Yc(215),Yc(216), + Yc(217),Yc(218),Yc(219),Yc(220),Yc(221),Yc(222),Yc(223),Yc(224),Yc(225), + Yc(226),Yc(227),Yc(228),Yc(229),Yc(230),Yc(231),Yc(232),Yc(233),Yc(234), + Yc(235),Yc(236),Yc(237),Yc(238),Yc(239),Yc(240),Yc(241),Yc(242),Yc(243), + Yc(244),Yc(245),Yc(246),Yc(247),Yc(248),Yc(249),Yc(250),Yc(251),Yc(252), + Yc(253),Yc(254),Yc(255)}; + + static const int mapUcg[256] = { + Ucg(0),Ucg(1),Ucg(2),Ucg(3),Ucg(4),Ucg(5),Ucg(6),Ucg(7),Ucg(8),Ucg(9), + Ucg(10),Ucg(11),Ucg(12),Ucg(13),Ucg(14),Ucg(15),Ucg(16),Ucg(17),Ucg(18), + Ucg(19),Ucg(20),Ucg(21),Ucg(22),Ucg(23),Ucg(24),Ucg(25),Ucg(26),Ucg(27), + Ucg(28),Ucg(29),Ucg(30),Ucg(31),Ucg(32),Ucg(33),Ucg(34),Ucg(35),Ucg(36), + Ucg(37),Ucg(38),Ucg(39),Ucg(40),Ucg(41),Ucg(42),Ucg(43),Ucg(44),Ucg(45), + Ucg(46),Ucg(47),Ucg(48),Ucg(49),Ucg(50),Ucg(51),Ucg(52),Ucg(53),Ucg(54), + 
Ucg(55),Ucg(56),Ucg(57),Ucg(58),Ucg(59),Ucg(60),Ucg(61),Ucg(62),Ucg(63), + Ucg(64),Ucg(65),Ucg(66),Ucg(67),Ucg(68),Ucg(69),Ucg(70),Ucg(71),Ucg(72), + Ucg(73),Ucg(74),Ucg(75),Ucg(76),Ucg(77),Ucg(78),Ucg(79),Ucg(80),Ucg(81), + Ucg(82),Ucg(83),Ucg(84),Ucg(85),Ucg(86),Ucg(87),Ucg(88),Ucg(89),Ucg(90), + Ucg(91),Ucg(92),Ucg(93),Ucg(94),Ucg(95),Ucg(96),Ucg(97),Ucg(98),Ucg(99), + Ucg(100),Ucg(101),Ucg(102),Ucg(103),Ucg(104),Ucg(105),Ucg(106),Ucg(107), + Ucg(108),Ucg(109),Ucg(110),Ucg(111),Ucg(112),Ucg(113),Ucg(114),Ucg(115), + Ucg(116),Ucg(117),Ucg(118),Ucg(119),Ucg(120),Ucg(121),Ucg(122),Ucg(123), + Ucg(124),Ucg(125),Ucg(126),Ucg(127),Ucg(128),Ucg(129),Ucg(130),Ucg(131), + Ucg(132),Ucg(133),Ucg(134),Ucg(135),Ucg(136),Ucg(137),Ucg(138),Ucg(139), + Ucg(140),Ucg(141),Ucg(142),Ucg(143),Ucg(144),Ucg(145),Ucg(146),Ucg(147), + Ucg(148),Ucg(149),Ucg(150),Ucg(151),Ucg(152),Ucg(153),Ucg(154),Ucg(155), + Ucg(156),Ucg(157),Ucg(158),Ucg(159),Ucg(160),Ucg(161),Ucg(162),Ucg(163), + Ucg(164),Ucg(165),Ucg(166),Ucg(167),Ucg(168),Ucg(169),Ucg(170),Ucg(171), + Ucg(172),Ucg(173),Ucg(174),Ucg(175),Ucg(176),Ucg(177),Ucg(178),Ucg(179), + Ucg(180),Ucg(181),Ucg(182),Ucg(183),Ucg(184),Ucg(185),Ucg(186),Ucg(187), + Ucg(188),Ucg(189),Ucg(190),Ucg(191),Ucg(192),Ucg(193),Ucg(194),Ucg(195), + Ucg(196),Ucg(197),Ucg(198),Ucg(199),Ucg(200),Ucg(201),Ucg(202),Ucg(203), + Ucg(204),Ucg(205),Ucg(206),Ucg(207),Ucg(208),Ucg(209),Ucg(210),Ucg(211), + Ucg(212),Ucg(213),Ucg(214),Ucg(215),Ucg(216),Ucg(217),Ucg(218),Ucg(219), + Ucg(220),Ucg(221),Ucg(222),Ucg(223),Ucg(224),Ucg(225),Ucg(226),Ucg(227), + Ucg(228),Ucg(229),Ucg(230),Ucg(231),Ucg(232),Ucg(233),Ucg(234),Ucg(235), + Ucg(236),Ucg(237),Ucg(238),Ucg(239),Ucg(240),Ucg(241),Ucg(242),Ucg(243), + Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251), + Ucg(252),Ucg(253),Ucg(254),Ucg(255)}; + + static const int mapUcb[256] = { + Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9), + Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18), + Ucb(19),Ucb(20),Ucb(21),Ucb(22),Ucb(23),Ucb(24),Ucb(25),Ucb(26),Ucb(27), + Ucb(28),Ucb(29),Ucb(30),Ucb(31),Ucb(32),Ucb(33),Ucb(34),Ucb(35),Ucb(36), + Ucb(37),Ucb(38),Ucb(39),Ucb(40),Ucb(41),Ucb(42),Ucb(43),Ucb(44),Ucb(45), + Ucb(46),Ucb(47),Ucb(48),Ucb(49),Ucb(50),Ucb(51),Ucb(52),Ucb(53),Ucb(54), + Ucb(55),Ucb(56),Ucb(57),Ucb(58),Ucb(59),Ucb(60),Ucb(61),Ucb(62),Ucb(63), + Ucb(64),Ucb(65),Ucb(66),Ucb(67),Ucb(68),Ucb(69),Ucb(70),Ucb(71),Ucb(72), + Ucb(73),Ucb(74),Ucb(75),Ucb(76),Ucb(77),Ucb(78),Ucb(79),Ucb(80),Ucb(81), + Ucb(82),Ucb(83),Ucb(84),Ucb(85),Ucb(86),Ucb(87),Ucb(88),Ucb(89),Ucb(90), + Ucb(91),Ucb(92),Ucb(93),Ucb(94),Ucb(95),Ucb(96),Ucb(97),Ucb(98),Ucb(99), + Ucb(100),Ucb(101),Ucb(102),Ucb(103),Ucb(104),Ucb(105),Ucb(106),Ucb(107), + Ucb(108),Ucb(109),Ucb(110),Ucb(111),Ucb(112),Ucb(113),Ucb(114),Ucb(115), + Ucb(116),Ucb(117),Ucb(118),Ucb(119),Ucb(120),Ucb(121),Ucb(122),Ucb(123), + Ucb(124),Ucb(125),Ucb(126),Ucb(127),Ucb(128),Ucb(129),Ucb(130),Ucb(131), + Ucb(132),Ucb(133),Ucb(134),Ucb(135),Ucb(136),Ucb(137),Ucb(138),Ucb(139), + Ucb(140),Ucb(141),Ucb(142),Ucb(143),Ucb(144),Ucb(145),Ucb(146),Ucb(147), + Ucb(148),Ucb(149),Ucb(150),Ucb(151),Ucb(152),Ucb(153),Ucb(154),Ucb(155), + Ucb(156),Ucb(157),Ucb(158),Ucb(159),Ucb(160),Ucb(161),Ucb(162),Ucb(163), + Ucb(164),Ucb(165),Ucb(166),Ucb(167),Ucb(168),Ucb(169),Ucb(170),Ucb(171), + Ucb(172),Ucb(173),Ucb(174),Ucb(175),Ucb(176),Ucb(177),Ucb(178),Ucb(179), + Ucb(180),Ucb(181),Ucb(182),Ucb(183),Ucb(184),Ucb(185),Ucb(186),Ucb(187), + 
Ucb(188),Ucb(189),Ucb(190),Ucb(191),Ucb(192),Ucb(193),Ucb(194),Ucb(195), + Ucb(196),Ucb(197),Ucb(198),Ucb(199),Ucb(200),Ucb(201),Ucb(202),Ucb(203), + Ucb(204),Ucb(205),Ucb(206),Ucb(207),Ucb(208),Ucb(209),Ucb(210),Ucb(211), + Ucb(212),Ucb(213),Ucb(214),Ucb(215),Ucb(216),Ucb(217),Ucb(218),Ucb(219), + Ucb(220),Ucb(221),Ucb(222),Ucb(223),Ucb(224),Ucb(225),Ucb(226),Ucb(227), + Ucb(228),Ucb(229),Ucb(230),Ucb(231),Ucb(232),Ucb(233),Ucb(234),Ucb(235), + Ucb(236),Ucb(237),Ucb(238),Ucb(239),Ucb(240),Ucb(241),Ucb(242),Ucb(243), + Ucb(244),Ucb(245),Ucb(246),Ucb(247),Ucb(248),Ucb(249),Ucb(250),Ucb(251), + Ucb(252),Ucb(253),Ucb(254),Ucb(255)}; + + static const int mapVcr[256] = { + Vcr(0),Vcr(1),Vcr(2),Vcr(3),Vcr(4),Vcr(5),Vcr(6),Vcr(7),Vcr(8),Vcr(9), + Vcr(10),Vcr(11),Vcr(12),Vcr(13),Vcr(14),Vcr(15),Vcr(16),Vcr(17),Vcr(18), + Vcr(19),Vcr(20),Vcr(21),Vcr(22),Vcr(23),Vcr(24),Vcr(25),Vcr(26),Vcr(27), + Vcr(28),Vcr(29),Vcr(30),Vcr(31),Vcr(32),Vcr(33),Vcr(34),Vcr(35),Vcr(36), + Vcr(37),Vcr(38),Vcr(39),Vcr(40),Vcr(41),Vcr(42),Vcr(43),Vcr(44),Vcr(45), + Vcr(46),Vcr(47),Vcr(48),Vcr(49),Vcr(50),Vcr(51),Vcr(52),Vcr(53),Vcr(54), + Vcr(55),Vcr(56),Vcr(57),Vcr(58),Vcr(59),Vcr(60),Vcr(61),Vcr(62),Vcr(63), + Vcr(64),Vcr(65),Vcr(66),Vcr(67),Vcr(68),Vcr(69),Vcr(70),Vcr(71),Vcr(72), + Vcr(73),Vcr(74),Vcr(75),Vcr(76),Vcr(77),Vcr(78),Vcr(79),Vcr(80),Vcr(81), + Vcr(82),Vcr(83),Vcr(84),Vcr(85),Vcr(86),Vcr(87),Vcr(88),Vcr(89),Vcr(90), + Vcr(91),Vcr(92),Vcr(93),Vcr(94),Vcr(95),Vcr(96),Vcr(97),Vcr(98),Vcr(99), + Vcr(100),Vcr(101),Vcr(102),Vcr(103),Vcr(104),Vcr(105),Vcr(106),Vcr(107), + Vcr(108),Vcr(109),Vcr(110),Vcr(111),Vcr(112),Vcr(113),Vcr(114),Vcr(115), + Vcr(116),Vcr(117),Vcr(118),Vcr(119),Vcr(120),Vcr(121),Vcr(122),Vcr(123), + Vcr(124),Vcr(125),Vcr(126),Vcr(127),Vcr(128),Vcr(129),Vcr(130),Vcr(131), + Vcr(132),Vcr(133),Vcr(134),Vcr(135),Vcr(136),Vcr(137),Vcr(138),Vcr(139), + Vcr(140),Vcr(141),Vcr(142),Vcr(143),Vcr(144),Vcr(145),Vcr(146),Vcr(147), + Vcr(148),Vcr(149),Vcr(150),Vcr(151),Vcr(152),Vcr(153),Vcr(154),Vcr(155), + Vcr(156),Vcr(157),Vcr(158),Vcr(159),Vcr(160),Vcr(161),Vcr(162),Vcr(163), + Vcr(164),Vcr(165),Vcr(166),Vcr(167),Vcr(168),Vcr(169),Vcr(170),Vcr(171), + Vcr(172),Vcr(173),Vcr(174),Vcr(175),Vcr(176),Vcr(177),Vcr(178),Vcr(179), + Vcr(180),Vcr(181),Vcr(182),Vcr(183),Vcr(184),Vcr(185),Vcr(186),Vcr(187), + Vcr(188),Vcr(189),Vcr(190),Vcr(191),Vcr(192),Vcr(193),Vcr(194),Vcr(195), + Vcr(196),Vcr(197),Vcr(198),Vcr(199),Vcr(200),Vcr(201),Vcr(202),Vcr(203), + Vcr(204),Vcr(205),Vcr(206),Vcr(207),Vcr(208),Vcr(209),Vcr(210),Vcr(211), + Vcr(212),Vcr(213),Vcr(214),Vcr(215),Vcr(216),Vcr(217),Vcr(218),Vcr(219), + Vcr(220),Vcr(221),Vcr(222),Vcr(223),Vcr(224),Vcr(225),Vcr(226),Vcr(227), + Vcr(228),Vcr(229),Vcr(230),Vcr(231),Vcr(232),Vcr(233),Vcr(234),Vcr(235), + Vcr(236),Vcr(237),Vcr(238),Vcr(239),Vcr(240),Vcr(241),Vcr(242),Vcr(243), + Vcr(244),Vcr(245),Vcr(246),Vcr(247),Vcr(248),Vcr(249),Vcr(250),Vcr(251), + Vcr(252),Vcr(253),Vcr(254),Vcr(255)}; + + + static const int mapVcg[256] = { + Vcg(0),Vcg(1),Vcg(2),Vcg(3),Vcg(4),Vcg(5),Vcg(6),Vcg(7),Vcg(8),Vcg(9), + Vcg(10),Vcg(11),Vcg(12),Vcg(13),Vcg(14),Vcg(15),Vcg(16),Vcg(17),Vcg(18), + Vcg(19),Vcg(20),Vcg(21),Vcg(22),Vcg(23),Vcg(24),Vcg(25),Vcg(26),Vcg(27), + Vcg(28),Vcg(29),Vcg(30),Vcg(31),Vcg(32),Vcg(33),Vcg(34),Vcg(35),Vcg(36), + Vcg(37),Vcg(38),Vcg(39),Vcg(40),Vcg(41),Vcg(42),Vcg(43),Vcg(44),Vcg(45), + Vcg(46),Vcg(47),Vcg(48),Vcg(49),Vcg(50),Vcg(51),Vcg(52),Vcg(53),Vcg(54), + Vcg(55),Vcg(56),Vcg(57),Vcg(58),Vcg(59),Vcg(60),Vcg(61),Vcg(62),Vcg(63), + 
Vcg(64),Vcg(65),Vcg(66),Vcg(67),Vcg(68),Vcg(69),Vcg(70),Vcg(71),Vcg(72), + Vcg(73),Vcg(74),Vcg(75),Vcg(76),Vcg(77),Vcg(78),Vcg(79),Vcg(80),Vcg(81), + Vcg(82),Vcg(83),Vcg(84),Vcg(85),Vcg(86),Vcg(87),Vcg(88),Vcg(89),Vcg(90), + Vcg(91),Vcg(92),Vcg(93),Vcg(94),Vcg(95),Vcg(96),Vcg(97),Vcg(98),Vcg(99), + Vcg(100),Vcg(101),Vcg(102),Vcg(103),Vcg(104),Vcg(105),Vcg(106),Vcg(107), + Vcg(108),Vcg(109),Vcg(110),Vcg(111),Vcg(112),Vcg(113),Vcg(114),Vcg(115), + Vcg(116),Vcg(117),Vcg(118),Vcg(119),Vcg(120),Vcg(121),Vcg(122),Vcg(123), + Vcg(124),Vcg(125),Vcg(126),Vcg(127),Vcg(128),Vcg(129),Vcg(130),Vcg(131), + Vcg(132),Vcg(133),Vcg(134),Vcg(135),Vcg(136),Vcg(137),Vcg(138),Vcg(139), + Vcg(140),Vcg(141),Vcg(142),Vcg(143),Vcg(144),Vcg(145),Vcg(146),Vcg(147), + Vcg(148),Vcg(149),Vcg(150),Vcg(151),Vcg(152),Vcg(153),Vcg(154),Vcg(155), + Vcg(156),Vcg(157),Vcg(158),Vcg(159),Vcg(160),Vcg(161),Vcg(162),Vcg(163), + Vcg(164),Vcg(165),Vcg(166),Vcg(167),Vcg(168),Vcg(169),Vcg(170),Vcg(171), + Vcg(172),Vcg(173),Vcg(174),Vcg(175),Vcg(176),Vcg(177),Vcg(178),Vcg(179), + Vcg(180),Vcg(181),Vcg(182),Vcg(183),Vcg(184),Vcg(185),Vcg(186),Vcg(187), + Vcg(188),Vcg(189),Vcg(190),Vcg(191),Vcg(192),Vcg(193),Vcg(194),Vcg(195), + Vcg(196),Vcg(197),Vcg(198),Vcg(199),Vcg(200),Vcg(201),Vcg(202),Vcg(203), + Vcg(204),Vcg(205),Vcg(206),Vcg(207),Vcg(208),Vcg(209),Vcg(210),Vcg(211), + Vcg(212),Vcg(213),Vcg(214),Vcg(215),Vcg(216),Vcg(217),Vcg(218),Vcg(219), + Vcg(220),Vcg(221),Vcg(222),Vcg(223),Vcg(224),Vcg(225),Vcg(226),Vcg(227), + Vcg(228),Vcg(229),Vcg(230),Vcg(231),Vcg(232),Vcg(233),Vcg(234),Vcg(235), + Vcg(236),Vcg(237),Vcg(238),Vcg(239),Vcg(240),Vcg(241),Vcg(242),Vcg(243), + Vcg(244),Vcg(245),Vcg(246),Vcg(247),Vcg(248),Vcg(249),Vcg(250),Vcg(251), + Vcg(252),Vcg(253),Vcg(254),Vcg(255)}; + +} // namespace libyuv + +#endif + diff --git a/files/source/convert.cc b/files/source/convert.cc new file mode 100644 index 00000000..8154dcb7 --- /dev/null +++ b/files/source/convert.cc @@ -0,0 +1,904 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "conversion_tables.h" +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "row.h" + +//#define SCALEOPT //Currently for windows only. 
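The converters below index the tables defined above rather than multiplying per pixel. A quick self-contained check (mine, not part of the commit) that the table macros agree with the fixed-point equations quoted at the top of conversion_tables.h:

```cpp
#include <cassert>

// Re-state the table macros as functions (same coefficients as above).
static int Yc(int i)  { return 298 * (i - 16); }
static int Ucg(int i) { return -100 * (i - 128); }
static int Ucb(int i) { return 516 * (i - 128); }
static int Vcr(int i) { return 409 * (i - 128); }
static int Vcg(int i) { return -208 * (i - 128); }

int main() {
  // Video white is roughly Y=235, U=V=128; each channel should come out 255.
  const int y = 235, u = 128, v = 128;
  const int r = (Yc(y) + Vcr(v) + 128) >> 8;           // (65262+128)>>8 == 255
  const int g = (Yc(y) + Ucg(u) + Vcg(v) + 128) >> 8;  // chroma terms are 0
  const int b = (Yc(y) + Ucb(u) + 128) >> 8;
  assert(r == 255 && g == 255 && b == 255);
  return 0;
}
```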
June 2010 + +#ifdef SCALEOPT +#include <emmintrin.h> +#endif + +namespace libyuv { + +static inline uint8 Clip(int32 val) { + if (val < 0) { + return (uint8) 0; + } else if (val > 255){ + return (uint8) 255; + } + return (uint8) val; +} + +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + + // RGB orientation - bottom up + // TODO(fbarchard): support inversion + uint8* out = dst_frame + dst_stride_frame * height - dst_stride_frame; + uint8* out2 = out - dst_stride_frame; + int h, w; + int tmp_r, tmp_g, tmp_b; + const uint8 *y1, *y2 ,*u, *v; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; + for (h = ((height + 1) >> 1); h > 0; h--){ + // 2 rows at a time, 2 y's at a time + for (w = 0; w < ((width + 1) >> 1); w++){ + // Vertical and horizontal sub-sampling + tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = Clip(tmp_b); + out[1] = Clip(tmp_g); + out[2] = Clip(tmp_r); + + tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[3] = Clip(tmp_b); + out[4] = Clip(tmp_g); + out[5] = Clip(tmp_r); + + tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = Clip(tmp_b); + out2[1] = Clip(tmp_g); + out2[2] = Clip(tmp_r); + + tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[3] = Clip(tmp_b); + out2[4] = Clip(tmp_g); + out2[5] = Clip(tmp_r); + + out += 6; + out2 += 6; + y1 += 2; + y2 += 2; + u++; + v++; + } + y1 += src_stride_y + src_stride_y - width; + y2 += src_stride_y + src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= dst_stride_frame * 3; + out2 -= dst_stride_frame * 3; + } // end height for + return 0; +} + +// Little Endian... 
+int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + + // RGB orientation - bottom up + uint8* out = dst_frame + dst_stride_frame * (height - 1); + uint8* out2 = out - dst_stride_frame; + int tmp_r, tmp_g, tmp_b; + const uint8 *y1,*y2, *u, *v; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; + int h, w; + + for (h = ((height + 1) >> 1); h > 0; h--) { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < ((width + 1) >> 1); w++) { + // Vertical and horizontal sub-sampling + // Convert to RGB888 and re-scale to 4 bits + tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] =(uint8)((Clip(tmp_g) & 0xf0) + (Clip(tmp_b) >> 4)); + out[1] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); + + tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); + out[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); + + tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); + out2[1] = (uint8) (0xf0 + (Clip(tmp_r) >> 4)); + + tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[2] = (uint8)((Clip(tmp_g) & 0xf0 ) + (Clip(tmp_b) >> 4)); + out2[3] = (uint8)(0xf0 + (Clip(tmp_r) >> 4)); + + out += 4; + out2 += 4; + y1 += 2; + y2 += 2; + u++; + v++; + } + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= (dst_stride_frame + width) * 2; + out2 -= (dst_stride_frame + width) * 2; + } // end height for + return 0; +} + + +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame; + + int tmp_r, tmp_g, tmp_b; + const uint8* y1,* y2, * u, * v; + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; + int h, w; + + for (h = ((height + 1) >> 1); h > 0; h--){ + // 2 rows at a time, 2 y's at a time + for (w = 0; w < ((width + 1) >> 1); w++){ + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2. 
Shift to adequate location (in the 16 bit word) - RGB 565 + + tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b ) >> 3); + + tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + u++; + v++; + } + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= 2 * dst_stride_frame + width; + out2 -= 2 * dst_stride_frame + width; + } + return 0; +} + + +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame ; + int32 tmp_r, tmp_g, tmp_b; + const uint8 *y1,*y2, *u, *v; + int h, w; + + y1 = src_y; + y2 = y1 + src_stride_y; + u = src_u; + v = src_v; + + for (h = ((height + 1) >> 1); h > 0; h--){ + // 2 rows at a time, 2 y's at a time + for (w = 0; w < ((width + 1) >> 1); w++){ + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2. Shift to adequate location (in the 16 bit word) - RGB 555 + // 3. 
Add 1 for alpha value + tmp_r = (int32)((mapYc[y1[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[0]] + mapUcb[u[0]] + 128) >> 8); + out[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + + ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); + + tmp_r = (int32)((mapYc[y1[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[1]] + mapUcb[u[0]] + 128) >> 8); + out[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + + ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); + + tmp_r = (int32)((mapYc[y2[0]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[0]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[0]] + mapUcb[u[0]] + 128) >> 8); + out2[0] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + + ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); + + tmp_r = (int32)((mapYc[y2[1]] + mapVcr[v[0]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[1]] + mapUcg[u[0]] + mapVcg[v[0]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[1]] + mapUcb[u[0]] + 128) >> 8); + out2[1] = (uint16)(0x8000 + ((Clip(tmp_r) & 0xf8) << 10) + + ((Clip(tmp_g) & 0xf8) << 3) + (Clip(tmp_b) >> 3)); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + u++; + v++; + } + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out -= 2 * dst_stride_frame + width; + out2 -= 2 * dst_stride_frame + width; + } + return 0; +} + + +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + + const uint8* in1 = src_y; + const uint8* in2 = src_y + src_stride_y; + + uint8* out1 = dst_frame; + uint8* out2 = dst_frame + dst_stride_frame; + + // YUY2 - Macro-pixel = 2 image pixels + // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
+#ifndef SCALEOPT + for (int i = 0; i < ((height + 1) >> 1); i++){ + for (int j = 0; j < ((width + 1) >> 1); j++){ + out1[0] = in1[0]; + out1[1] = *src_u; + out1[2] = in1[1]; + out1[3] = *src_v; + + out2[0] = in2[0]; + out2[1] = *src_u; + out2[2] = in2[1]; + out2[3] = *src_v; + out1 += 4; + out2 += 4; + src_u++; + src_v++; + in1 += 2; + in2 += 2; + } + in1 += 2 * src_stride_y - width; + in2 += 2 * src_stride_y - width; + src_u += src_stride_u - ((width + 1) >> 1); + src_v += src_stride_v - ((width + 1) >> 1); + out1 += dst_stride_frame + dst_stride_frame - 2 * width; + out2 += dst_stride_frame + dst_stride_frame - 2 * width; + } +#else + for (WebRtc_UWord32 i = 0; i < ((height + 1) >> 1);i++) { + int32 width__ = (width >> 4); + _asm + { + ;pusha + mov eax, DWORD PTR [in1] ;1939.33 + mov ecx, DWORD PTR [in2] ;1939.33 + mov ebx, DWORD PTR [src_u] ;1939.33 + mov edx, DWORD PTR [src_v] ;1939.33 + loop0: + movq xmm6, QWORD PTR [ebx] ;src_u + movq xmm0, QWORD PTR [edx] ;src_v + punpcklbw xmm6, xmm0 ;src_u, src_v mix + ;movdqa xmm1, xmm6 + ;movdqa xmm2, xmm6 + ;movdqa xmm4, xmm6 + + movdqu xmm3, XMMWORD PTR [eax] ;in1 + movdqa xmm1, xmm3 + punpcklbw xmm1, xmm6 ;in1, src_u, in1, src_v + mov esi, DWORD PTR [out1] + movdqu XMMWORD PTR [esi], xmm1 ;write to out1 + + movdqu xmm5, XMMWORD PTR [ecx] ;in2 + movdqa xmm2, xmm5 + punpcklbw xmm2, xmm6 ;in2, src_u, in2, src_v + mov edi, DWORD PTR [out2] + movdqu XMMWORD PTR [edi], xmm2 ;write to out2 + + punpckhbw xmm3, xmm6 ;in1, src_u, in1, src_v again + movdqu XMMWORD PTR [esi+16], xmm3 ;write to out1 again + add esi, 32 + mov DWORD PTR [out1], esi + + punpckhbw xmm5, xmm6 ;src_u, in2, src_v again + movdqu XMMWORD PTR [edi+16], xmm5 ;write to out2 again + add edi, 32 + mov DWORD PTR [out2], edi + + add ebx, 8 + add edx, 8 + add eax, 16 + add ecx, 16 + + mov esi, DWORD PTR [width__] + sub esi, 1 + mov DWORD PTR [width__], esi + jg loop0 + + mov DWORD PTR [in1], eax ;1939.33 + mov DWORD PTR [in2], ecx ;1939.33 + mov DWORD PTR [src_u], ebx ;1939.33 + mov DWORD PTR [src_v], edx ;1939.33 + + ;popa + emms + } + in1 += 2 * src_stride_y - width; + in2 += 2 * src_stride_y - width; + out1 += dst_stride_frame + dst_stride_frame - 2 * width; + out2 += dst_stride_frame + dst_stride_frame - 2 * width; + } +#endif + return 0; +} + +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_u == NULL || src_v == NULL || dst_frame == NULL) { + return -1; + } + + int i = 0; + const uint8* y1 = src_y; + const uint8* y2 = y1 + src_stride_y; + const uint8* u = src_u; + const uint8* v = src_v; + + uint8* out1 = dst_frame; + uint8* out2 = dst_frame + dst_stride_frame; + + // Macro-pixel = 2 image pixels + // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5..... 
+ +#ifndef SCALEOPT + for (; i < ((height + 1) >> 1); i++) { + for (int j = 0; j < ((width + 1) >> 1); j++) { + out1[0] = *u; + out1[1] = y1[0]; + out1[2] = *v; + out1[3] = y1[1]; + + out2[0] = *u; + out2[1] = y2[0]; + out2[2] = *v; + out2[3] = y2[1]; + out1 += 4; + out2 += 4; + u++; + v++; + y1 += 2; + y2 += 2; + } + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + u += src_stride_u - ((width + 1) >> 1); + v += src_stride_v - ((width + 1) >> 1); + out1 += 2 * (dst_stride_frame - width); + out2 += 2 * (dst_stride_frame - width); + } +#else + for (; i < (height >> 1);i++) { + int32 width__ = (width >> 4); + _asm + { + ;pusha + mov eax, DWORD PTR [y1] ;1939.33 + mov ecx, DWORD PTR [y2] ;1939.33 + mov ebx, DWORD PTR [src_u] ;1939.33 + mov edx, DWORD PTR [src_v] ;1939.33 +loop0: + movq xmm6, QWORD PTR [ebx] ;src_u + movq xmm0, QWORD PTR [edx] ;src_v + punpcklbw xmm6, xmm0 ;src_u, src_v mix + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 + movdqa xmm4, xmm6 + + movdqu xmm3, XMMWORD PTR [eax] ;y1 + punpcklbw xmm1, xmm3 ;src_u, y1, src_v + mov esi, DWORD PTR [out1] + movdqu XMMWORD PTR [esi], xmm1 ;write to out1 + + movdqu xmm5, XMMWORD PTR [ecx] ;y2 + punpcklbw xmm2, xmm5 ;src_u, y2, src_v + mov edi, DWORD PTR [out2] + movdqu XMMWORD PTR [edi], xmm2 ;write to out2 + + punpckhbw xmm4, xmm3 ;src_u, y1, src_v again + movdqu XMMWORD PTR [esi+16], xmm4 ;write to out1 again + add esi, 32 + mov DWORD PTR [out1], esi + + punpckhbw xmm6, xmm5 ;src_u, y2, src_v again + movdqu XMMWORD PTR [edi+16], xmm6 ;write to out2 again + add edi, 32 + mov DWORD PTR [out2], edi + + add ebx, 8 + add edx, 8 + add eax, 16 + add ecx, 16 + + mov esi, DWORD PTR [width__] + sub esi, 1 + mov DWORD PTR [width__], esi + jg loop0 + + mov DWORD PTR [y1], eax ;1939.33 + mov DWORD PTR [y2], ecx ;1939.33 + mov DWORD PTR [src_u], ebx ;1939.33 + mov DWORD PTR [src_v], edx ;1939.33 + + ;popa + emms + } + y1 += width; + y2 += width; + out1 += 2 * (dst_stride_frame - width); + out2 += 2 * (dst_stride_frame - width); + } +#endif + return 0; +} + + +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_y == NULL || src_uv == NULL || dst_frame == NULL) { + return -1; + } + + // Bi-Planar: Y plane followed by an interleaved U and V plane + const uint8* interlacedSrc = src_uv; + uint16* out = (uint16*)(dst_frame) + dst_stride_frame * (height - 1); + uint16* out2 = out - dst_stride_frame; + int32 tmp_r, tmp_g, tmp_b; + const uint8 *y1,*y2; + y1 = src_y; + y2 = y1 + src_stride_y; + int h, w; + + for (h = ((height + 1) >> 1); h > 0; h--) { + // 2 rows at a time, 2 y's at a time + for (w = 0; w < ((width + 1) >> 1); w++) { + // Vertical and horizontal sub-sampling + // 1. Convert to RGB888 + // 2.
Shift to adequate location (in the 16 bit word) - RGB 565 + + tmp_r = (int32)((mapYc[y1[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[0]] + mapUcg[interlacedSrc[0]] + + mapVcg[interlacedSrc[1]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8); + out[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + tmp_r = (int32)((mapYc[y1[1]] + mapVcr[interlacedSrc[1]] + 128) >> 8); + tmp_g = (int32)((mapYc[y1[1]] + mapUcg[interlacedSrc[0]] + + mapVcg[interlacedSrc[1]] + 128) >> 8); + tmp_b = (int32)((mapYc[y1[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8); + out[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b ) >> 3); + + tmp_r = (int32)((mapYc[y2[0]] + mapVcr[interlacedSrc[1]] + 128) >> 8); + tmp_g = (int32)((mapYc[y2[0]] + mapUcg[interlacedSrc[0]] + + mapVcg[interlacedSrc[1]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[0]] + mapUcb[interlacedSrc[0]] + 128) >> 8); + out2[0] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + tmp_r = (int32)((mapYc[y2[1]] + mapVcr[interlacedSrc[1]] + + 128) >> 8); + tmp_g = (int32)((mapYc[y2[1]] + mapUcg[interlacedSrc[0]] + + mapVcg[interlacedSrc[1]] + 128) >> 8); + tmp_b = (int32)((mapYc[y2[1]] + mapUcb[interlacedSrc[0]] + 128) >> 8); + out2[1] = (uint16)((Clip(tmp_r) & 0xf8) << 8) + ((Clip(tmp_g) + & 0xfc) << 3) + (Clip(tmp_b) >> 3); + + y1 += 2; + y2 += 2; + out += 2; + out2 += 2; + interlacedSrc += 2; + } + y1 += 2 * src_stride_y - width; + y2 += 2 * src_stride_y - width; + interlacedSrc += src_stride_uv - ((width + 1) >> 1); + out -= 3 * dst_stride_frame + dst_stride_frame - width; + out2 -= 3 * dst_stride_frame + dst_stride_frame - width; + } + return 0; +} + +// TODO(fbarchard): Deprecated - this is same as BG24ToARGB with -height +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_frame, int dst_stride_frame, + int width, int height) { + if (src_frame == NULL || dst_frame == NULL) { + return -1; + } + + int i, j, offset; + uint8* outFrame = dst_frame; + const uint8* inFrame = src_frame; + + outFrame += dst_stride_frame * (height - 1) * 4; + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + offset = j * 4; + outFrame[0 + offset] = inFrame[0]; + outFrame[1 + offset] = inFrame[1]; + outFrame[2 + offset] = inFrame[2]; + outFrame[3 + offset] = 0xff; + inFrame += 3; + } + outFrame -= 4 * (dst_stride_frame - width); + inFrame += src_stride_frame - width; + } + return 0; +} + +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_ARGBTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ARGBToYRow_C; + } +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 
16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = ARGBToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_BGRATOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = BGRAToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = BGRAToYRow_C; + } +#if defined(HAS_BGRATOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = BGRAToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = BGRAToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_ABGRTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = ABGRToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ABGRToYRow_C; + } +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = ABGRToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = ABGRToUVRow_C; + } + + 
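+  // Chroma is subsampled 2x2, so each pass below consumes two ABGR rows:
+  // one ARGBToUVRow call averages the row pair into U and V, and two
+  // ARGBToYRow calls emit the corresponding full-resolution Y rows.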
for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_RGB24TOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = RGB24ToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = RGB24ToYRow_C; + } +#if defined(HAS_RGB24TOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = RGB24ToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = RGB24ToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_RAWTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = RAWToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = RAWToYRow_C; + } +#if defined(HAS_RAWTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = RAWToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = RAWToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, 
width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +} // namespace libyuv diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc new file mode 100644 index 00000000..cc44e215 --- /dev/null +++ b/files/source/cpu_id.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/cpu_id.h" +#include "libyuv/basic_types.h" // for CPU_X86 + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#endif + +namespace libyuv { + +// CPU detect function for SIMD instruction sets. +static int cpu_info_ = 0; + +// TODO(fbarchard): (cpu_info[2] & 0x10000000 ? kCpuHasAVX : 0) +static void InitCpuFlags() { +#ifdef CPU_X86 + int cpu_info[4]; + __cpuid(cpu_info, 1); + cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | + (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | + kCpuInitialized; +#elif defined(__ARM_NEON__) + // gcc -mfpu=neon defines __ARM_NEON__ + // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags + // to disable Neon on devices that do not have it. + cpu_info_ = kCpuHasNEON | kCpuInitialized; +#else + cpu_info_ = kCpuInitialized; +#endif +} + +void MaskCpuFlags(int enable_flags) { + InitCpuFlags(); + cpu_info_ &= enable_flags; +} + +bool TestCpuFlag(int flag) { + if (0 == cpu_info_) { + InitCpuFlags(); + } + return cpu_info_ & flag ? true : false; +} + +} // namespace libyuv diff --git a/files/source/format_conversion.cc b/files/source/format_conversion.cc new file mode 100644 index 00000000..958f44c4 --- /dev/null +++ b/files/source/format_conversion.cc @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "libyuv/cpu_id.h" +#include "video_common.h" +#include "row.h" + +#define kMaxStride (2048 * 4) + +namespace libyuv { + +// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers +// and vst would select which 2 components to write. 
The low level would need +// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_ARGBTOBAYERROW_SSSE3 +__declspec(naked) +static void ARGBToBayerRow_SSSE3(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + movd xmm7, [esp + 12] // selector + mov ecx, [esp + 16] // pix + pshufd xmm7, xmm7, 0 + + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm7 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 4 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +#define HAS_ARGBTOBAYERROW_SSSE3 +static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile( + "movd %3,%%xmm7\n" + "pshufd $0x0,%%xmm7,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movd %%xmm0,(%1)\n" + "lea 0x4(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "memory" +); +} +#endif + +static void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. + for (int x = 0; x < (pix - 1); x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[index0]; + } +} + +// generate a selector mask useful for pshufb +static uint32 GenerateSelector(int select0, int select1) { + return static_cast<uint32>(select0) | + static_cast<uint32>((select1 + 4) << 8) | + static_cast<uint32>((select0 + 8) << 16) | + static_cast<uint32>((select1 + 12) << 24); +} + +// Converts 32 bit ARGB to any Bayer RGB format. +int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb, + uint8* dst_bayer, int dst_stride_bayer, + uint32 dst_fourcc_bayer, + int width, int height) { + if (height < 0) { + height = -height; + src_rgb = src_rgb + (height - 1) * src_stride_rgb; + src_stride_rgb = -src_stride_rgb; + } + void (*ARGBToBayerRow)(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) && + IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } else +#endif + { + ARGBToBayerRow = ARGBToBayerRow_C; + } + + int blue_index = 0; + int green_index = 1; + int red_index = 2; + + // Now build a lookup table containing the indices for the four pixels in each + // 2x2 Bayer grid. + uint32 index_map[2]; + switch (dst_fourcc_bayer) { + default: + assert(false); + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_BGGR: + index_map[0] = GenerateSelector(blue_index, green_index); + index_map[1] = GenerateSelector(green_index, red_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + case FOURCC_GBRG: + index_map[0] = GenerateSelector(green_index, blue_index); + index_map[1] = GenerateSelector(red_index, green_index); + break; + } + + // Now convert. 
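+  // index_map[y & 1] alternates the selector pair between even and odd rows,
+  // matching how a 2x2 Bayer cell swaps its channel layout from one row to
+  // the next.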
+ for (int y = 0; y < height; ++y) { + ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width); + src_rgb += src_stride_rgb; + dst_bayer += dst_stride_bayer; + } + return 0; +} + +#define AVG(a,b) (((a) + (b)) >> 1) + +static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_rgb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 r = src_bayer1[1]; + for (int x = 0; x < (pix - 2); x += 2) { + dst_rgb[0] = src_bayer0[0]; + dst_rgb[1] = AVG(g, src_bayer0[1]); + dst_rgb[2] = AVG(r, src_bayer1[1]); + dst_rgb[3] = 255U; + dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]); + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = src_bayer1[1]; + dst_rgb[7] = 255U; + g = src_bayer0[1]; + r = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_rgb += 8; + } + dst_rgb[0] = src_bayer0[0]; + dst_rgb[1] = AVG(g, src_bayer0[1]); + dst_rgb[2] = AVG(r, src_bayer1[1]); + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer0[0]; + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = src_bayer1[1]; + dst_rgb[7] = 255U; +} + +static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_rgb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 b = src_bayer1[1]; + for (int x = 0; x < (pix - 2); x += 2) { + dst_rgb[0] = AVG(b, src_bayer1[1]); + dst_rgb[1] = AVG(g, src_bayer0[1]); + dst_rgb[2] = src_bayer0[0]; + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer1[1]; + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]); + dst_rgb[7] = 255U; + g = src_bayer0[1]; + b = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_rgb += 8; + } + dst_rgb[0] = AVG(b, src_bayer1[1]); + dst_rgb[1] = AVG(g, src_bayer0[1]); + dst_rgb[2] = src_bayer0[0]; + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer1[1]; + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = src_bayer0[0]; + dst_rgb[7] = 255U; +} + +static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_rgb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 b = src_bayer0[1]; + for (int x = 0; x < (pix - 2); x += 2) { + dst_rgb[0] = AVG(b, src_bayer0[1]); + dst_rgb[1] = src_bayer0[0]; + dst_rgb[2] = src_bayer1[0]; + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer0[1]; + dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]); + dst_rgb[7] = 255U; + b = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_rgb += 8; + } + dst_rgb[0] = AVG(b, src_bayer0[1]); + dst_rgb[1] = src_bayer0[0]; + dst_rgb[2] = src_bayer1[0]; + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer0[1]; + dst_rgb[5] = src_bayer0[0]; + dst_rgb[6] = src_bayer1[0]; + dst_rgb[7] = 255U; +} + +static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_rgb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 r = src_bayer0[1]; + for (int x = 0; x < (pix - 2); x += 2) { + dst_rgb[0] = src_bayer1[0]; + dst_rgb[1] = src_bayer0[0]; + dst_rgb[2] = AVG(r, src_bayer0[1]); + dst_rgb[3] = 255U; + dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]); + dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_rgb[6] = src_bayer0[1]; + dst_rgb[7] = 255U; + r = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_rgb += 8; + } + dst_rgb[0] = src_bayer1[0]; + dst_rgb[1] = src_bayer0[0]; + dst_rgb[2] = AVG(r, src_bayer0[1]); + dst_rgb[3] = 255U; + dst_rgb[4] = src_bayer1[0]; + dst_rgb[5] = src_bayer0[0]; + dst_rgb[6] = src_bayer0[1]; + dst_rgb[7] = 
255U; +} + +// Converts any Bayer RGB format to ARGB. +int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height) { + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_rgb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_rgb, int pix); + + switch (src_fourcc_bayer) { + default: + assert(false); + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + } + + for (int y = 0; y < (height - 1); y += 2) { + BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + dst_rgb + dst_stride_rgb, width); + src_bayer += src_stride_bayer * 2; + dst_rgb += dst_stride_rgb * 2; + } + if (height & 1) { + BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width); + } + return 0; +} + +// Converts any Bayer RGB format to ARGB. +int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint32 src_fourcc_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (width * 4 > kMaxStride) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_rgb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_rgb, int pix); + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + +#if defined(HAS_ARGBTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ARGBToYRow_C; + } +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = ARGBToUVRow_C; + } + + switch (src_fourcc_bayer) { + default: + assert(false); + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + } + + for (int y = 0; y < (height - 1); y += 2) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + 
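+    // The second demosaiced row goes into the upper half of the scratch
+    // buffer; the negative stride lets BayerRow1 reach its neighbor row above.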
+    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+              row + kMaxStride, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+    src_bayer += src_stride_bayer * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // TODO(fbarchard): Make sure this filters properly
+  if (height & 1) {
+    BayerRow0(src_bayer, src_stride_bayer, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+  }
+  return 0;
+}
+
+}  // namespace libyuv
diff --git a/files/source/general.cc b/files/source/general.cc
new file mode 100644
index 00000000..9d39f9bf
--- /dev/null
+++ b/files/source/general.cc
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/general.h"
+
+#include <string.h>  // memcpy(), memset()
+
+#include "libyuv/planar_functions.h"
+
+namespace libyuv {
+
+int
+I420Mirror(const uint8* src_yplane, int src_ystride,
+           const uint8* src_uplane, int src_ustride,
+           const uint8* src_vplane, int src_vstride,
+           uint8* dst_yplane, int dst_ystride,
+           uint8* dst_uplane, int dst_ustride,
+           uint8* dst_vplane, int dst_vstride,
+           int width, int height) {
+  if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
+      dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) {
+    return -1;
+  }
+
+  int wind, hind;
+  uint8 tmpVal, tmpValU, tmpValV;
+  // Will swap two values per iteration
+  const int halfWidth = (width + 1) >> 1;
+
+  // Y
+  for (hind = 0; hind < height; hind++) {
+    for (wind = 0; wind < halfWidth; wind++) {
+      tmpVal = src_yplane[hind * src_ystride + wind];
+      dst_yplane[hind * dst_ystride + wind] =
+          src_yplane[hind * src_ystride + (width - wind - 1)];
+      dst_yplane[hind * dst_ystride + (width - wind - 1)] = tmpVal;
+    }
+  }
+
+  // Chroma planes are half width / half height; mirror across the full
+  // chroma row using each plane's own stride.
+  const int uvHeight = (height + 1) >> 1;
+  const int uvWidth = (width + 1) >> 1;
+  const int halfuvWidth = (uvWidth + 1) >> 1;
+
+  for (hind = 0; hind < uvHeight; hind++) {
+    for (wind = 0; wind < halfuvWidth; wind++) {
+      // U
+      tmpValU = src_uplane[hind * src_ustride + wind];
+      dst_uplane[hind * dst_ustride + wind] =
+          src_uplane[hind * src_ustride + (uvWidth - wind - 1)];
+      dst_uplane[hind * dst_ustride + (uvWidth - wind - 1)] = tmpValU;
+      // V
+      tmpValV = src_vplane[hind * src_vstride + wind];
+      dst_vplane[hind * dst_vstride + wind] =
+          src_vplane[hind * src_vstride + (uvWidth - wind - 1)];
+      dst_vplane[hind * dst_vstride + (uvWidth - wind - 1)] = tmpValV;
+    }
+  }
+  return 0;
+}
+
+// Make a center cut
+int
+I420Crop(uint8* frame,
+         int src_width, int src_height,
+         int dst_width, int dst_height)
+{
+  if (frame == NULL)
+    return -1;
+
+  if (src_width == dst_width && src_height == dst_height) {
+    // Nothing to do; return the I420 frame size in bytes.
+    return 3 * dst_height * dst_width / 2;
+  }
+  if (dst_width > src_width || dst_height > src_height) {
+    // error
+    return -1;
+  }
+  int i = 0;
+  int m = 0;
+  int loop = 0;
+  int half_dst_width = dst_width / 2;
+  int halfdst_height = dst_height / 2;
+  int halfsrc_width = src_width / 2;
+  int half_dst_height = src_height / 2;  // actually half the source height
+  int crop_height = (src_height - dst_height) / 2;
+  int crop_width = (src_width - dst_width) / 2;
+
+  for (i = src_width * crop_height + crop_width; loop < dst_height;
+       loop++, i +=
src_width) { + memcpy(&frame[m],&frame[i],dst_width); + m += dst_width; + } + i = src_width * src_height; // ilum + loop = 0; + for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); + loop < halfdst_height; loop++,i += halfsrc_width) { + memcpy(&frame[m],&frame[i],half_dst_width); + m += half_dst_width; + } + loop = 0; + i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr + for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2); + loop < halfdst_height; loop++, i += halfsrc_width) { + memcpy(&frame[m],&frame[i],half_dst_width); + m += half_dst_width; + } + return 0; +} + + +int +I420CropPad(const uint8* src_frame, int src_width, + int src_height, uint8* dst_frame, + int dst_width, int dst_height) +{ + if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) { + return -1; + } + if (src_width == dst_width && src_height == dst_height) { + memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1)); + } else { + if (src_height < dst_height) { + // pad height + int pad_height = dst_height - src_height; + int i = 0; + int pad_width = 0; + int crop_width = 0; + int width = src_width; + if (src_width < dst_width) { + // pad width + pad_width = dst_width - src_width; + } else { + // cut width + crop_width = src_width - dst_width; + width = dst_width; + } + if (pad_height) { + memset(dst_frame, 0, dst_width * (pad_height >> 1)); + dst_frame += dst_width * (pad_height >> 1); + } + for (i = 0; i < src_height;i++) { + if (pad_width) { + memset(dst_frame, 0, pad_width / 2); + dst_frame += pad_width / 2; + } + src_frame += crop_width >> 1; // in case we have a cut + memcpy(dst_frame,src_frame ,width); + src_frame += crop_width >> 1; + dst_frame += width; + src_frame += width; + if (pad_width) { + memset(dst_frame, 0, pad_width / 2); + dst_frame += pad_width / 2; + } + } + if (pad_height) { + memset(dst_frame, 0, dst_width * (pad_height >> 1)); + dst_frame += dst_width * (pad_height >> 1); + } + if (pad_height) { + memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); + dst_frame += (dst_width >> 2) * (pad_height >> 1); + } + for (i = 0; i < (src_height >> 1); i++) { + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + src_frame += crop_width >> 2; // in case we have a cut + memcpy(dst_frame, src_frame,width >> 1); + src_frame += crop_width >> 2; + dst_frame += width >> 1; + src_frame += width >> 1; + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + } + if (pad_height) { + memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1)); + dst_frame += (dst_width >> 1) * (pad_height >> 1); + } + for (i = 0; i < (src_height >> 1); i++) { + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + src_frame += crop_width >> 2; // in case we have a cut + memcpy(dst_frame, src_frame,width >> 1); + src_frame += crop_width >> 2; + dst_frame += width >> 1; + src_frame += width >> 1; + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + } + if (pad_height) { + memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1)); + dst_frame += (dst_width >> 2) * (pad_height >> 1); + } + } else { + // cut height + int i = 0; + int pad_width = 0; + int crop_width = 0; + int width = src_width; + + if (src_width < dst_width) { + // pad width + pad_width = dst_width - src_width; + } else { + // cut width + crop_width = src_width - dst_width; + width = dst_width; + } + int diff_height = 
src_height - dst_height; + src_frame += src_width * (diff_height >> 1); // skip top I + + for (i = 0; i < dst_height; i++) { + if (pad_width) { + memset(dst_frame, 0, pad_width / 2); + dst_frame += pad_width / 2; + } + src_frame += crop_width >> 1; // in case we have a cut + memcpy(dst_frame,src_frame ,width); + src_frame += crop_width >> 1; + dst_frame += width; + src_frame += width; + if (pad_width) { + memset(dst_frame, 0, pad_width / 2); + dst_frame += pad_width / 2; + } + } + src_frame += src_width * (diff_height >> 1); // skip end I + src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr + for (i = 0; i < (dst_height >> 1); i++) { + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + src_frame += crop_width >> 2; // in case we have a cut + memcpy(dst_frame, src_frame,width >> 1); + src_frame += crop_width >> 2; + dst_frame += width >> 1; + src_frame += width >> 1; + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + } + src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr + src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb + for (i = 0; i < (dst_height >> 1); i++) { + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + src_frame += crop_width >> 2; // in case we have a cut + memcpy(dst_frame, src_frame, width >> 1); + src_frame += crop_width >> 2; + dst_frame += width >> 1; + src_frame += width >> 1; + if (pad_width) { + memset(dst_frame, 127, pad_width >> 2); + dst_frame += pad_width >> 2; + } + } + } + } + return 0; +} + +} // namespace libyuv diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc new file mode 100644 index 00000000..a7e3e38a --- /dev/null +++ b/files/source/planar_functions.cc @@ -0,0 +1,1575 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/planar_functions.h" + +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "row.h" + +namespace libyuv { + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITUV_NEON +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v +// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. +static void SplitUV_NEON(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + __asm__ volatile + ( + "1:\n" + "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV + "vst1.u8 {q0}, [%1]! \n" // store U + "vst1.u8 {q1}, [%2]! \n" // Store V + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_uv), + "+r"(dst_u), + "+r"(dst_v), + "+r"(pix) // Output registers + : // Input registers + : "q0", "q1" // Clobber List + ); +} + +#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#if defined(_MSC_VER) +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#else +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#endif + +// Shuffle table for converting ABGR to ARGB. 
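+// (16-byte pshufb masks: byte i of a mask selects which source byte lands in
+//  output byte i, covering four 32 bit pixels per XMM register.)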
+extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting BGRA to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITUV_SSE2 +__declspec(naked) +static void SplitUV_SSE2(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm7 // even bytes + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqa [edi], xmm2 + lea edi, [edi + 16] + sub ecx, 16 + ja wloop + pop edi + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_SPLITUV_SSE2 +static void SplitUV_SSE2(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "psrlw $0x8,%%xmm2\n" + "psrlw $0x8,%%xmm3\n" + "packuswb %%xmm3,%%xmm2\n" + "movdqa %%xmm2,(%2)\n" + "lea 0x10(%2),%2\n" + "sub $0x10,%3\n" + "ja 1b\n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory" +); +} +#endif +#endif + +static void SplitUV_C(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of UV. + for (int x = 0; x < pix; ++x) { + dst_u[0] = src_uv[0]; + dst_v[0] = src_uv[1]; + src_uv += 2; + dst_u += 1; + dst_v += 1; + } +} + +static void I420CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + // Copy plane + for (int y = 0; y < height; ++y) { + memcpy(dst_y, src_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I420 with optional flipping +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + + // Negative height means invert the image. 
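+  // (A sketch of the idiom: for an 8-row plane, src += 7 * stride points at
+  //  the last row, and the negated stride makes each row step move upward,
+  //  so the unchanged copy loop below emits the rows in reverse order.)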
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// SetRows32 writes 'count' bytes using a 32 bit value repeated + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#define HAS_SETROW_NEON +static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { + __asm__ volatile + ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1:\n" + "vst1.u32 {q0}, [%0]! \n" // store + "subs %1, %1, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "q0", "memory" + ); +} + +#elif defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_SETROW_SSE2 +__declspec(naked) +static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { + __asm { + mov eax, [esp + 4] // dst + movd xmm7, [esp + 8] // v32 + mov ecx, [esp + 12] // count + pshufd xmm7, xmm7, 0 + + wloop: + movdqa [eax], xmm7 + lea eax, [eax + 16] + sub ecx, 16 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +#define HAS_SETROW_SSE2 +static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { + asm volatile( + "movd %2, %%xmm7\n" + "pshufd $0x0,%%xmm7,%%xmm7\n" +"1:" + "movdqa %%xmm7,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%1\n" + "ja 1b\n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "memory" +); +} +#endif + +static void SetRow8_C(uint8* dst, uint32 v8, int count) { + memset(dst, v8, count); +} + +static void I420SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + int value) { + void (*SetRow)(uint8* dst, uint32 value, int pix); +#if defined(HAS_SETROW_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (width % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + SetRow = SetRow32_NEON; + } else +#elif defined(HAS_SETROW_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (width % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + SetRow = SetRow32_SSE2; + } else +#endif + { + SetRow = SetRow8_C; + } + + uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); + // Set plane + for (int y = 0; y < height; ++y) { + SetRow(dst_y, v32, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height == 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + + I420SetPlane(start_y, dst_stride_y, width, height, value_y); + I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Helper function to copy yuv data without scaling. Used +// by our jpeg conversion callbacks to incrementally fill a yuv image. +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Copy Y plane + I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + + // SubSample UV planes. + int x, y; + int halfwidth = (width + 1) >> 1; + for (y = 0; y < height; y += 2) { + const uint8* u0 = src_u; + const uint8* u1 = src_u + src_stride_u; + if ((y + 1) >= height) { + u1 = u0; + } + for (x = 0; x < halfwidth; ++x) { + dst_u[x] = (u0[x] + u1[x] + 1) >> 1; + } + src_u += src_stride_u * 2; + dst_u += dst_stride_u; + } + for (y = 0; y < height; y += 2) { + const uint8* v0 = src_v; + const uint8* v1 = src_v + src_stride_v; + if ((y + 1) >= height) { + v1 = v0; + } + for (x = 0; x < halfwidth; ++x) { + dst_v[x] = (v0[x] + v1[x] + 1) >> 1; + } + src_v += src_stride_v * 2; + dst_v += dst_stride_v; + } + return 0; +} + +static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride_frame, + int width, int height) { + // Copy plane + for (int y = 0; y < height; y += 2) { + memcpy(dst, src, width); + src += src_stride_0; + dst += dst_stride_frame; + memcpy(dst, src, width); + src += src_stride_1; + dst += dst_stride_frame; + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + + int halfwidth = (width + 1) >> 1; + void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +#if defined(HAS_SPLITUV_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (halfwidth % 16 == 0) && + IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { + SplitUV = SplitUV_NEON; + } else +#elif defined(HAS_SPLITUV_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (halfwidth % 16 == 0) && + IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) { + SplitUV = SplitUV_SSE2; + } else +#endif + { + SplitUV = SplitUV_C; + } + + I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + + int halfheight = (height + 1) >> 1; + for (int y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUV(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert M420 to I420. +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV12 to I420. +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV12 to I420. Deprecated. 
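+// (This overload keeps the older calling convention in which a single frame
+//  stride is applied to both the Y plane and the interleaved UV plane.)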
+int NV12ToI420(const uint8* src_y, + const uint8* src_uv, + int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_frame, src_stride_frame, + src_uv, src_stride_frame, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITYUY2_SSE2 +__declspec(naked) +static void SplitYUY2_SSE2(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov edx, [esp + 8 + 8] // dst_y + mov esi, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm2, xmm7 // even bytes are Y + pand xmm3, xmm7 + packuswb xmm2, xmm3 + movdqa [edx], xmm2 + lea edx, [edx + 16] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [esi], xmm0 + lea esi, [esi + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_SPLITYUY2_SSE2 +static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "pand %%xmm7,%%xmm2\n" + "pand %%xmm7,%%xmm3\n" + "packuswb %%xmm3,%%xmm2\n" + "movdqa %%xmm2,(%1)\n" + "lea 0x10(%1),%1\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%2)\n" + "lea 0x8(%2),%2\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%3)\n" + "lea 0x8(%3),%3\n" + "sub $0x10,%4\n" + "ja 1b\n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory" +); +} +#endif + +static void SplitYUY2_C(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of YUY2. + for (int x = 0; x < pix; x += 2) { + dst_y[0] = src_yuy2[0]; + dst_y[1] = src_yuy2[2]; + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_y += 2; + dst_u += 1; + dst_v += 1; + } +} + +// Convert Q420 to I420. +// Format is rows of YY/YUYV +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + int halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + void (*SplitYUY2)(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); +#if defined(HAS_SPLITYUY2_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (width % 16 == 0) && + IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + SplitYUY2 = SplitYUY2_SSE2; + } else +#endif + { + SplitYUY2 = SplitYUY2_C; + } + for (int y = 0; y < height; y += 2) { + memcpy(dst_y, src_y, width); + dst_y += dst_stride_y; + src_y += src_stride_y; + + // Copy a row of YUY2. + SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width); + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_yuy2 += src_stride_yuy2; + } + return 0; +} + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_YUY2TOI420ROW_SSE2 +__declspec(naked) +void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm7 // even bytes are Y + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + ret + } +} + +__declspec(naked) +void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} + +#define HAS_UYVYTOI420ROW_SSE2 +__declspec(naked) +void UYVYToI420RowY_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + ret + } +} + +__declspec(naked) +void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa 
xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm7 // UYVY -> UVUV + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +#define HAS_YUY2TOI420ROW_SSE2 +static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory" +); +} + +static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%4,1),%%xmm2\n" + "movdqa 0x10(%0,%4,1),%%xmm3\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%2)\n" + "lea 0x8(%2),%2\n" + "sub $0x10,%3\n" + "ja 1b\n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 + : "memory" +); +} +#define HAS_UYVYTOI420ROW_SSE2 +static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + asm volatile( +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory" +); +} + +static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%4,1),%%xmm2\n" + "movdqa 0x10(%0,%4,1),%%xmm3\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "pand %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,(%2)\n" + "lea 0x8(%2),%2\n" + "sub $0x10,%3\n" + "ja 1b\n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 + : "memory" +); +} +#endif + +// Filter 2 rows of YUY2 UV's (422) into U and V (420) +void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + // Output a row of UV values, filtering 2 
+  for (int x = 0; x < pix; x += 2) {
+    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+    src_yuy2 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void YUY2ToI420RowY_C(const uint8* src_yuy2,
+                      uint8* dst_y, int pix) {
+  // Copy a row of YUY2 Y values.
+  for (int x = 0; x < pix; ++x) {
+    dst_y[0] = src_yuy2[0];
+    src_yuy2 += 2;
+    dst_y += 1;
+  }
+}
+
+void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  // Output a row of UV values, filtering 2 rows of UYVY.
+  for (int x = 0; x < pix; x += 2) {
+    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+    src_uyvy += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void UYVYToI420RowY_C(const uint8* src_uyvy,
+                      uint8* dst_y, int pix) {
+  // Copy a row of UYVY Y values.
+  for (int x = 0; x < pix; ++x) {
+    dst_y[0] = src_uyvy[1];
+    src_uyvy += 2;
+    dst_y += 1;
+  }
+}
+
+// Convert YUY2 to I420.
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+  void (*YUY2ToI420RowY)(const uint8* src_yuy2,
+                         uint8* dst_y, int pix);
+#if defined(HAS_YUY2TOI420ROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
+    YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
+  } else
+#endif
+  {
+    YUY2ToI420RowY = YUY2ToI420RowY_C;
+    YUY2ToI420RowUV = YUY2ToI420RowUV_C;
+  }
+  for (int y = 0; y < height; ++y) {
+    if ((y & 1) == 0) {
+      if (y >= (height - 1)) {  // For odd height, average the last row with itself.
+        src_stride_yuy2 = 0;
+      }
+      YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    YUY2ToI420RowY(src_yuy2, dst_y, width);
+    dst_y += dst_stride_y;
+    src_yuy2 += src_stride_yuy2;
+  }
+  return 0;
+}
+
+// Convert UYVY to I420.
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
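+  // (e.g. height = -480 with src_stride_uyvy = 1280: reading starts at the
+  // last row, 479 * 1280 bytes in, and the negated stride walks upward.)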
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+  void (*UYVYToI420RowY)(const uint8* src_uyvy,
+                         uint8* dst_y, int pix);
+#if defined(HAS_UYVYTOI420ROW_SSE2)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
+      (width % 16 == 0) &&
+      IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+    UYVYToI420RowY = UYVYToI420RowY_SSE2;
+    UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
+  } else
+#endif
+  {
+    UYVYToI420RowY = UYVYToI420RowY_C;
+    UYVYToI420RowUV = UYVYToI420RowUV_C;
+  }
+  for (int y = 0; y < height; ++y) {
+    if ((y & 1) == 0) {
+      if (y >= (height - 1)) {  // For odd height, average the last row with itself.
+        src_stride_uyvy = 0;
+      }
+      UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    UYVYToI420RowY(src_uyvy, dst_y, width);
+    dst_y += dst_stride_y;
+    src_uyvy += src_stride_uyvy;
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB.
+// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// Convert I420 to BGRA.
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  EMMS();
+  return 0;
+}
+
+// Convert I420 to ABGR.
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  EMMS();
+  return 0;
+}
+
+// Convert I422 to ARGB.
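+// I422 chroma is subsampled horizontally only (one U/V pair per two pixels
+// on every row), so unlike I420ToARGB above, src_u and src_v advance on
+// every row rather than every other row.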
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// Convert I444 to ARGB.
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  // MMX used for FastConvertYUV444ToRGB32Row requires an emms instruction.
+  EMMS();
+  return 0;
+}
+
+// Convert I400 to ARGB (reference version).
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    FastConvertYToRGB32Row(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  // MMX used for FastConvertYToRGB32Row requires an emms instruction.
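+  // (emms clears the MMX register tag word so later x87 floating point
+  // code does not fault.)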
+ EMMS(); + return 0; +} + +// TODO(fbarchard): 64 bit version +#if defined(WIN32) && !defined(COVERAGE_ENABLED) + +#define HAS_I400TOARGBROW_SSE2 +__declspec(naked) +static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + + wloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm7 + por xmm1, xmm7 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + ja wloop + ret + } +} + +#define HAS_ABGRTOARGBROW_SSSE3 +__declspec(naked) +static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_abgr + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm7, _kShuffleMaskABGRToARGB + + convertloop : + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm7 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + +#define HAS_BGRATOARGBROW_SSSE3 +__declspec(naked) +static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_bgra + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm7, _kShuffleMaskBGRAToARGB + + convertloop : + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm7 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +// TODO(yuche): consider moving ARGB related codes to a separate file. +#define HAS_I400TOARGBROW_SSE2 +static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "pslld $0x18,%%xmm7\n" +"1:" + "movq (%0),%%xmm0\n" + "lea 0x8(%0),%0\n" + "punpcklbw %%xmm0,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "punpcklwd %%xmm0,%%xmm0\n" + "punpckhwd %%xmm1,%%xmm1\n" + "por %%xmm7,%%xmm0\n" + "por %%xmm7,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "movdqa %%xmm1,0x10(%1)\n" + "lea 0x20(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory" +); +} + +#define HAS_ABGRTOARGBROW_SSSE3 +static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, + int pix) { + asm volatile( + "movdqa (%3),%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_abgr), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskABGRToARGB) // %3 + : "memory" +); +} + +#define HAS_BGRATOARGBROW_SSSE3 +static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, + int pix) { + asm volatile( + "movdqa (%3),%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_bgra), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskBGRAToARGB) // %3 + : "memory" +); +} + +#endif + +static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { + // Copy a Y to RGB. 
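+  // ARGB here is byte order B, G, R, A in memory (a little-endian ARGB
+  // word); writing Y to bytes 0-2 and 255 to byte 3 yields opaque gray,
+  // matching the 0xff000000 alpha mask built in the SSE2 row above.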
+ for (int x = 0; x < pix; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// Convert I400 to ARGB. +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); +#if defined(HAS_I400TOARGBROW_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (width % 8 == 0) && + IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } else +#endif + { + I400ToARGBRow = I400ToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 r = src_abgr[0]; + uint8 g = src_abgr[1]; + uint8 b = src_abgr[2]; + uint8 a = src_abgr[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_abgr += 4; + } +} + +int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); +#if defined(HAS_ABGRTOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + ABGRToARGBRow = ABGRToARGBRow_SSSE3; + } else +#endif + { + ABGRToARGBRow = ABGRToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + ABGRToARGBRow(src_abgr, dst_argb, width); + src_abgr += src_stride_abgr; + dst_argb += dst_stride_argb; + } + return 0; +} + +static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 a = src_bgra[0]; + uint8 r = src_bgra[1]; + uint8 g = src_bgra[2]; + uint8 b = src_bgra[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_bgra += 4; + } +} + +// Convert BGRA to ARGB. +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix); +#if defined(HAS_BGRATOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + BGRAToARGBRow = BGRAToARGBRow_SSSE3; + } else +#endif + { + BGRAToARGBRow = BGRAToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + BGRAToARGBRow(src_bgra, dst_argb, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to I400. 
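+// I400 is a single-plane (grayscale) format, so this extracts one luma byte
+// per ARGB pixel via ARGBToYRow. Example call, with hypothetical buffer
+// names, for a 640x480 ARGB image (ARGB stride = width * 4 bytes):
+//   ARGBToI400(argb, 640 * 4, y_plane, 640, 640, 480);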
+int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); +#if defined(HAS_ARGBTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 4 == 0) && + IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ARGBToYRow_C; + } + + for (int y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + + +// Convert RAW to ARGB. +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } else +#endif + { + RAWToARGBRow = RAWToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert BG24 to ARGB. +int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_bg24 = src_bg24 + (height - 1) * src_stride_bg24; + src_stride_bg24 = -src_stride_bg24; + } + void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); +#if defined(HAS_BG24TOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + BG24ToARGBRow = BG24ToARGBRow_SSSE3; + } else +#endif + { + BG24ToARGBRow = BG24ToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + BG24ToARGBRow(src_bg24, dst_argb, width); + src_bg24 += src_stride_bg24; + dst_argb += dst_stride_argb; + } + return 0; +} + +} // namespace libyuv + diff --git a/files/source/rotate.cc b/files/source/rotate.cc new file mode 100644 index 00000000..7d3a3324 --- /dev/null +++ b/files/source/rotate.cc @@ -0,0 +1,1310 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "rotate_priv.h" + +#include "libyuv/cpu_id.h" + +namespace libyuv { + +#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#if defined(_MSC_VER) +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#else +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#endif +// Shuffle table for reversing the bytes. +extern "C" TALIGN16(const uint8, kShuffleReverse[16]) = + { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; +// Shuffle table for reversing the bytes of UV channels. +extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) = + { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; +#endif + +typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int); +typedef void (*reverse_func)(const uint8*, uint8*, int); +typedef void (*rotate_uv_wx8_func)(const uint8*, int, + uint8*, int, + uint8*, int, int); +typedef void (*rotate_uv_wxh_func)(const uint8*, int, + uint8*, int, + uint8*, int, int, int); +typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int); +typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int); + +#ifdef __ARM_NEON__ +extern "C" { +void RestoreRegisters_NEON(unsigned long long *restore); +void SaveRegisters_NEON(unsigned long long *store); +#define HAS_REVERSE_LINE_NEON +void ReverseLine_NEON(const uint8* src, uint8* dst, int width); +#define HAS_REVERSE_LINE_UV_NEON +void ReverseLineUV_NEON(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width); +#define HAS_TRANSPOSE_WX8_NEON +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWX8_NEON +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +} // extern "C" +#endif + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_TRANSPOSE_WX8_SSSE3 +__declspec(naked) +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { +__asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + convertloop : + // Read in the data from the source pointer. + // First round of bit swap. + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. 
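+    // After the three interleave rounds (punpcklbw on bytes, punpcklwd on
+    // words, punpckldq on dwords) each xmm register holds two transposed
+    // 8-byte output rows; palignr rotates the high qword down so each row
+    // can be stored with a single movq.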
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + sub ecx, 8 + ja convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +__declspec(naked) +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { +__asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + convertloop : + // Read in the data from the source pointer. + // First round of bit swap. + movdqa xmm0, [eax] + movdqa xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqa xmm2, [eax] + movdqa xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqa xmm4, [eax] + movdqa xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqa xmm6, [eax] + movdqa xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqa xmm5, [esp] // restore xmm5 + movdqa [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqa xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+ punpckldq xmm1, xmm5 + movlpd qword ptr [edx], xmm1 + movhpd qword ptr [ebx], xmm1 + punpckhdq xmm0, xmm5 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm3 // use xmm0 as the temp register. + punpckldq xmm3, xmm7 + movlpd qword ptr [edx], xmm3 + movhpd qword ptr [ebx], xmm3 + punpckhdq xmm0, xmm7 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + sub ecx, 8 + ja convertloop + + mov esp, [esp + 16] + pop ebp + pop edi + pop esi + pop ebx + ret + } +} +#elif (defined(__i386__) || defined(__x86_64__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_TRANSPOSE_WX8_SSSE3 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile( +"1:" + // Read in the data from the source pointer. + // First round of bit swap. + "movq (%0),%%xmm0\n" + "movq (%0,%3),%%xmm1\n" + "lea (%0,%3,2),%0\n" + "punpcklbw %%xmm1,%%xmm0\n" + "movq (%0),%%xmm2\n" + "movdqa %%xmm0,%%xmm1\n" + "palignr $0x8,%%xmm1,%%xmm1\n" + "movq (%0,%3),%%xmm3\n" + "lea (%0,%3,2),%0\n" + "punpcklbw %%xmm3,%%xmm2\n" + "movdqa %%xmm2,%%xmm3\n" + "movq (%0),%%xmm4\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "movq (%0,%3),%%xmm5\n" + "lea (%0,%3,2),%0\n" + "punpcklbw %%xmm5,%%xmm4\n" + "movdqa %%xmm4,%%xmm5\n" + "movq (%0),%%xmm6\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "movq (%0,%3),%%xmm7\n" + "lea (%0,%3,2),%0\n" + "punpcklbw %%xmm7,%%xmm6\n" + "neg %3\n" + "movdqa %%xmm6,%%xmm7\n" + "lea 0x8(%0,%3,8),%0\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "neg %3\n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0\n" + "punpcklwd %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "palignr $0x8,%%xmm2,%%xmm2\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "punpcklwd %%xmm6,%%xmm4\n" + "punpcklwd %%xmm7,%%xmm5\n" + "movdqa %%xmm4,%%xmm6\n" + "movdqa %%xmm5,%%xmm7\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + // Third round of bit swap. + // Write to the destination pointer. 
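+  // (Same three-round interleave as the Visual C version above; palignr
+  // $0x8 rotates the upper qword into the low half so each transposed
+  // 8-byte row stores with one movq.)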
+ "punpckldq %%xmm4,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movdqa %%xmm0,%%xmm4\n" + "palignr $0x8,%%xmm4,%%xmm4\n" + "movq %%xmm4,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm6,%%xmm2\n" + "movdqa %%xmm2,%%xmm6\n" + "movq %%xmm2,(%1)\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "punpckldq %%xmm5,%%xmm1\n" + "movq %%xmm6,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "movdqa %%xmm1,%%xmm5\n" + "movq %%xmm1,(%1)\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "movq %%xmm5,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm7,%%xmm3\n" + "movq %%xmm3,(%1)\n" + "movdqa %%xmm3,%%xmm7\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "movq %%xmm7,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(static_cast<intptr_t>(dst_stride)) // %4 + : "memory" +); +} + +#if defined (__i386__) +#define HAS_TRANSPOSE_UVWX8_SSE2 +extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w); + asm( + ".text\n" +#if defined(OSX) + ".globl _TransposeUVWx8_SSE2\n" +"_TransposeUVWx8_SSE2:\n" +#else + ".global TransposeUVWx8_SSE2\n" +"TransposeUVWx8_SSE2:\n" +#endif + "push %ebx\n" + "push %esi\n" + "push %edi\n" + "push %ebp\n" + "mov 0x14(%esp),%eax\n" + "mov 0x18(%esp),%edi\n" + "mov 0x1c(%esp),%edx\n" + "mov 0x20(%esp),%esi\n" + "mov 0x24(%esp),%ebx\n" + "mov 0x28(%esp),%ebp\n" + "mov %esp,%ecx\n" + "sub $0x14,%esp\n" + "and $0xfffffff0,%esp\n" + "mov %ecx,0x10(%esp)\n" + "mov 0x2c(%ecx),%ecx\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa (%eax,%edi,1),%xmm1\n" + "lea (%eax,%edi,2),%eax\n" + "movdqa %xmm0,%xmm7\n" + "punpcklbw %xmm1,%xmm0\n" + "punpckhbw %xmm1,%xmm7\n" + "movdqa %xmm7,%xmm1\n" + "movdqa (%eax),%xmm2\n" + "movdqa (%eax,%edi,1),%xmm3\n" + "lea (%eax,%edi,2),%eax\n" + "movdqa %xmm2,%xmm7\n" + "punpcklbw %xmm3,%xmm2\n" + "punpckhbw %xmm3,%xmm7\n" + "movdqa %xmm7,%xmm3\n" + "movdqa (%eax),%xmm4\n" + "movdqa (%eax,%edi,1),%xmm5\n" + "lea (%eax,%edi,2),%eax\n" + "movdqa %xmm4,%xmm7\n" + "punpcklbw %xmm5,%xmm4\n" + "punpckhbw %xmm5,%xmm7\n" + "movdqa %xmm7,%xmm5\n" + "movdqa (%eax),%xmm6\n" + "movdqa (%eax,%edi,1),%xmm7\n" + "lea (%eax,%edi,2),%eax\n" + "movdqa %xmm5,(%esp)\n" + "neg %edi\n" + "movdqa %xmm6,%xmm5\n" + "punpcklbw %xmm7,%xmm6\n" + "punpckhbw %xmm7,%xmm5\n" + "movdqa %xmm5,%xmm7\n" + "lea 0x10(%eax,%edi,8),%eax\n" + "neg %edi\n" + "movdqa %xmm0,%xmm5\n" + "punpcklwd %xmm2,%xmm0\n" + "punpckhwd %xmm2,%xmm5\n" + "movdqa %xmm5,%xmm2\n" + "movdqa %xmm1,%xmm5\n" + "punpcklwd %xmm3,%xmm1\n" + "punpckhwd %xmm3,%xmm5\n" + "movdqa %xmm5,%xmm3\n" + "movdqa %xmm4,%xmm5\n" + "punpcklwd %xmm6,%xmm4\n" + "punpckhwd %xmm6,%xmm5\n" + "movdqa %xmm5,%xmm6\n" + "movdqa (%esp),%xmm5\n" + "movdqa %xmm6,(%esp)\n" + "movdqa %xmm5,%xmm6\n" + "punpcklwd %xmm7,%xmm5\n" + "punpckhwd %xmm7,%xmm6\n" + "movdqa %xmm6,%xmm7\n" + "movdqa %xmm0,%xmm6\n" + "punpckldq %xmm4,%xmm0\n" + "punpckhdq %xmm4,%xmm6\n" + "movdqa %xmm6,%xmm4\n" + "movdqa (%esp),%xmm6\n" + "movlpd %xmm0,(%edx)\n" + "movhpd %xmm0,(%ebx)\n" + "movlpd %xmm4,(%edx,%esi,1)\n" + "lea (%edx,%esi,2),%edx\n" + "movhpd %xmm4,(%ebx,%ebp,1)\n" + "lea (%ebx,%ebp,2),%ebx\n" + "movdqa %xmm2,%xmm0\n" + "punpckldq %xmm6,%xmm2\n" + "movlpd %xmm2,(%edx)\n" + "movhpd %xmm2,(%ebx)\n" + "punpckhdq %xmm6,%xmm0\n" + "movlpd %xmm0,(%edx,%esi,1)\n" + "lea (%edx,%esi,2),%edx\n" + "movhpd %xmm0,(%ebx,%ebp,1)\n" + "lea (%ebx,%ebp,2),%ebx\n" + "movdqa %xmm1,%xmm0\n" + "punpckldq %xmm5,%xmm1\n" + "movlpd 
%xmm1,(%edx)\n" + "movhpd %xmm1,(%ebx)\n" + "punpckhdq %xmm5,%xmm0\n" + "movlpd %xmm0,(%edx,%esi,1)\n" + "lea (%edx,%esi,2),%edx\n" + "movhpd %xmm0,(%ebx,%ebp,1)\n" + "lea (%ebx,%ebp,2),%ebx\n" + "movdqa %xmm3,%xmm0\n" + "punpckldq %xmm7,%xmm3\n" + "movlpd %xmm3,(%edx)\n" + "movhpd %xmm3,(%ebx)\n" + "punpckhdq %xmm7,%xmm0\n" + "movlpd %xmm0,(%edx,%esi,1)\n" + "lea (%edx,%esi,2),%edx\n" + "movhpd %xmm0,(%ebx,%ebp,1)\n" + "lea (%ebx,%ebp,2),%ebx\n" + "sub $0x8,%ecx\n" + "ja 1b\n" + "mov 0x10(%esp),%esp\n" + "pop %ebp\n" + "pop %edi\n" + "pop %esi\n" + "pop %ebx\n" + "ret\n" +); +#elif defined (__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile( +"1:" + // Read in the data from the source pointer. + // First round of bit swap. + "movdqa (%0),%%xmm0\n" + "movdqa (%0,%3),%%xmm1\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm0,%%xmm8\n" + "punpcklbw %%xmm1,%%xmm0\n" + "punpckhbw %%xmm1,%%xmm8\n" + "movdqa (%0),%%xmm2\n" + "movdqa %%xmm0,%%xmm1\n" + "movdqa %%xmm8,%%xmm9\n" + "palignr $0x8,%%xmm1,%%xmm1\n" + "palignr $0x8,%%xmm9,%%xmm9\n" + "movdqa (%0,%3),%%xmm3\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm2,%%xmm10\n" + "punpcklbw %%xmm3,%%xmm2\n" + "punpckhbw %%xmm3,%%xmm10\n" + "movdqa %%xmm2,%%xmm3\n" + "movdqa %%xmm10,%%xmm11\n" + "movdqa (%0),%%xmm4\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "palignr $0x8,%%xmm11,%%xmm11\n" + "movdqa (%0,%3),%%xmm5\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm4,%%xmm12\n" + "punpcklbw %%xmm5,%%xmm4\n" + "punpckhbw %%xmm5,%%xmm12\n" + "movdqa %%xmm4,%%xmm5\n" + "movdqa %%xmm12,%%xmm13\n" + "movdqa (%0),%%xmm6\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "palignr $0x8,%%xmm13,%%xmm13\n" + "movdqa (%0,%3),%%xmm7\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm6,%%xmm14\n" + "punpcklbw %%xmm7,%%xmm6\n" + "punpckhbw %%xmm7,%%xmm14\n" + "neg %3\n" + "movdqa %%xmm6,%%xmm7\n" + "movdqa %%xmm14,%%xmm15\n" + "lea 0x10(%0,%3,8),%0\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + "neg %3\n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0\n" + "punpcklwd %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "palignr $0x8,%%xmm2,%%xmm2\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "punpcklwd %%xmm6,%%xmm4\n" + "punpcklwd %%xmm7,%%xmm5\n" + "movdqa %%xmm4,%%xmm6\n" + "movdqa %%xmm5,%%xmm7\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "punpcklwd %%xmm10,%%xmm8\n" + "punpcklwd %%xmm11,%%xmm9\n" + "movdqa %%xmm8,%%xmm10\n" + "movdqa %%xmm9,%%xmm11\n" + "palignr $0x8,%%xmm10,%%xmm10\n" + "palignr $0x8,%%xmm11,%%xmm11\n" + "punpcklwd %%xmm14,%%xmm12\n" + "punpcklwd %%xmm15,%%xmm13\n" + "movdqa %%xmm12,%%xmm14\n" + "movdqa %%xmm13,%%xmm15\n" + "palignr $0x8,%%xmm14,%%xmm14\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + // Third round of bit swap. + // Write to the destination pointer. 
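+  // With xmm8-xmm15 available on x86_64, each pass transposes a 16x8 tile
+  // (note the "sub $0x10" below), writing 16 transposed rows of 8 bytes
+  // and halving the pass count versus the 8-wide version.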
+ "punpckldq %%xmm4,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movdqa %%xmm0,%%xmm4\n" + "palignr $0x8,%%xmm4,%%xmm4\n" + "movq %%xmm4,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm6,%%xmm2\n" + "movdqa %%xmm2,%%xmm6\n" + "movq %%xmm2,(%1)\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "punpckldq %%xmm5,%%xmm1\n" + "movq %%xmm6,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "movdqa %%xmm1,%%xmm5\n" + "movq %%xmm1,(%1)\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "movq %%xmm5,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm7,%%xmm3\n" + "movq %%xmm3,(%1)\n" + "movdqa %%xmm3,%%xmm7\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "movq %%xmm7,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm12,%%xmm8\n" + "movq %%xmm8,(%1)\n" + "movdqa %%xmm8,%%xmm12\n" + "palignr $0x8,%%xmm12,%%xmm12\n" + "movq %%xmm12,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm14,%%xmm10\n" + "movdqa %%xmm10,%%xmm14\n" + "movq %%xmm10,(%1)\n" + "palignr $0x8,%%xmm14,%%xmm14\n" + "punpckldq %%xmm13,%%xmm9\n" + "movq %%xmm14,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "movdqa %%xmm9,%%xmm13\n" + "movq %%xmm9,(%1)\n" + "palignr $0x8,%%xmm13,%%xmm13\n" + "movq %%xmm13,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm15,%%xmm11\n" + "movq %%xmm11,(%1)\n" + "movdqa %%xmm11,%%xmm15\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + "movq %%xmm15,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(static_cast<intptr_t>(dst_stride)) // %4 + : "memory" +); +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + asm volatile( +"1:" + // Read in the data from the source pointer. + // First round of bit swap. + "movdqa (%0),%%xmm0\n" + "movdqa (%0,%4),%%xmm1\n" + "lea (%0,%4,2),%0\n" + "movdqa %%xmm0,%%xmm8\n" + "punpcklbw %%xmm1,%%xmm0\n" + "punpckhbw %%xmm1,%%xmm8\n" + "movdqa %%xmm8,%%xmm1\n" + "movdqa (%0),%%xmm2\n" + "movdqa (%0,%4),%%xmm3\n" + "lea (%0,%4,2),%0\n" + "movdqa %%xmm2,%%xmm8\n" + "punpcklbw %%xmm3,%%xmm2\n" + "punpckhbw %%xmm3,%%xmm8\n" + "movdqa %%xmm8,%%xmm3\n" + "movdqa (%0),%%xmm4\n" + "movdqa (%0,%4),%%xmm5\n" + "lea (%0,%4,2),%0\n" + "movdqa %%xmm4,%%xmm8\n" + "punpcklbw %%xmm5,%%xmm4\n" + "punpckhbw %%xmm5,%%xmm8\n" + "movdqa %%xmm8,%%xmm5\n" + "movdqa (%0),%%xmm6\n" + "movdqa (%0,%4),%%xmm7\n" + "lea (%0,%4,2),%0\n" + "movdqa %%xmm6,%%xmm8\n" + "punpcklbw %%xmm7,%%xmm6\n" + "neg %4\n" + "lea 0x10(%0,%4,8),%0\n" + "punpckhbw %%xmm7,%%xmm8\n" + "movdqa %%xmm8,%%xmm7\n" + "neg %4\n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8\n" + "movdqa %%xmm1,%%xmm9\n" + "punpckhwd %%xmm2,%%xmm8\n" + "punpckhwd %%xmm3,%%xmm9\n" + "punpcklwd %%xmm2,%%xmm0\n" + "punpcklwd %%xmm3,%%xmm1\n" + "movdqa %%xmm8,%%xmm2\n" + "movdqa %%xmm9,%%xmm3\n" + "movdqa %%xmm4,%%xmm8\n" + "movdqa %%xmm5,%%xmm9\n" + "punpckhwd %%xmm6,%%xmm8\n" + "punpckhwd %%xmm7,%%xmm9\n" + "punpcklwd %%xmm6,%%xmm4\n" + "punpcklwd %%xmm7,%%xmm5\n" + "movdqa %%xmm8,%%xmm6\n" + "movdqa %%xmm9,%%xmm7\n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8\n" + "punpckldq %%xmm4,%%xmm0\n" + "movlpd %%xmm0,(%1)\n" // Write back U channel + "movhpd %%xmm0,(%2)\n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8\n" + "movlpd %%xmm8,(%1,%5)\n" + "lea (%1,%5,2),%1\n" + "movhpd %%xmm8,(%2,%6)\n" + "lea (%2,%6,2),%2\n" + "movdqa %%xmm2,%%xmm8\n" + "punpckldq %%xmm6,%%xmm2\n" + "movlpd %%xmm2,(%1)\n" + "movhpd %%xmm2,(%2)\n" + "punpckhdq %%xmm6,%%xmm8\n" + "movlpd %%xmm8,(%1,%5)\n" + "lea (%1,%5,2),%1\n" + "movhpd %%xmm8,(%2,%6)\n" + "lea (%2,%6,2),%2\n" + "movdqa %%xmm1,%%xmm8\n" + "punpckldq %%xmm5,%%xmm1\n" + "movlpd %%xmm1,(%1)\n" + "movhpd %%xmm1,(%2)\n" + "punpckhdq %%xmm5,%%xmm8\n" + "movlpd %%xmm8,(%1,%5)\n" + "lea (%1,%5,2),%1\n" + "movhpd %%xmm8,(%2,%6)\n" + "lea (%2,%6,2),%2\n" + "movdqa %%xmm3,%%xmm8\n" + "punpckldq %%xmm7,%%xmm3\n" + "movlpd %%xmm3,(%1)\n" + "movhpd %%xmm3,(%2)\n" + "punpckhdq %%xmm7,%%xmm8\n" + "movlpd %%xmm8,(%1,%5)\n" + "lea (%1,%5,2),%1\n" + "movhpd %%xmm8,(%2,%6)\n" + "lea (%2,%6,2),%2\n" + "sub $0x8,%3\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(w) // %3 + : "r"(static_cast<intptr_t>(src_stride)), // %4 + "r"(static_cast<intptr_t>(dst_stride_a)), // %5 + "r"(static_cast<intptr_t>(dst_stride_b)) // %6 + : "memory" +); +} +#endif +#endif + +static void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int w) { + int i; + for (i = 0; i < w; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +static void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i, j; + for (i = 0; i < width; ++i) + for (j = 0; j < height; ++j) + dst[i * dst_stride + j] = src[j * src_stride + i]; +} + +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + rotate_wx8_func TransposeWx8; + rotate_wxh_func TransposeWxH; + +#if defined(HAS_TRANSPOSE_WX8_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (width % 8 == 0) && + IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + TransposeWx8 = TransposeWx8_NEON; + TransposeWxH = TransposeWxH_C; + } else +#endif +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; + TransposeWxH = TransposeWxH_C; + } else +#endif +#if defined(HAS_TRANSPOSE_WX8_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 8 == 0) && + IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + TransposeWx8 = TransposeWx8_SSSE3; + TransposeWxH = TransposeWxH_C; + } else +#endif + { + TransposeWx8 = TransposeWx8_C; + TransposeWxH = TransposeWxH_C; + } + + // work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + + src += 8 * src_stride; // go down 8 rows + dst += 8; // move over 8 columns + i -= 8; + } + + TransposeWxH(src, src_stride, dst, dst_stride, width, i); +} + +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // 
Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +static void ReverseLine_C(const uint8* src, uint8* dst, int width) { + int i; + src += width - 1; + for (i = 0; i < width; ++i) { + dst[i] = src[0]; + --src; + } +} + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_REVERSE_LINE_SSSE3 +__declspec(naked) +static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm7, _kShuffleReverse + lea eax, [eax + ecx - 16] + convertloop : + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm7 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +#elif (defined(__i386__) || defined(__x86_64__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_REVERSE_LINE_SSSE3 +static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile( + "movdqa (%3),%%xmm7\n" + "lea -0x10(%0,%2,1),%0\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea -0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "r"(kShuffleReverse) // %3 + : "memory" +); +} +#endif + +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + reverse_func ReverseLine; + +#if defined(HAS_REVERSE_LINE_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { + ReverseLine = ReverseLine_NEON; + } else +#endif +#if defined(HAS_REVERSE_LINE_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { + ReverseLine = ReverseLine_SSSE3; + } else +#endif + { + ReverseLine = ReverseLine_C; + } + // Rotate by 180 is a mirror and vertical flip + src += src_stride * (height - 1); + + for (i = 0; i < height; ++i) { + ReverseLine(src, dst, width); + src -= src_stride; + dst += dst_stride; + } +} + +static void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + int i; + for (i = 0; i < w; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * 
src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +static void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w, int h) { + int i, j; + for (i = 0; i < w * 2; i += 2) + for (j = 0; j < h; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } +} + +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + rotate_uv_wx8_func TransposeWx8; + rotate_uv_wxh_func TransposeWxH; + +#if defined(HAS_TRANSPOSE_UVWX8_NEON) + unsigned long long store_reg[8]; + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { + SaveRegisters_NEON(store_reg); + TransposeWx8 = TransposeUVWx8_NEON; + TransposeWxH = TransposeUVWxH_C; + } else +#endif +#if defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (width % 8 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && + IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) { + TransposeWx8 = TransposeUVWx8_SSE2; + TransposeWxH = TransposeUVWxH_C; + } else +#endif + { + TransposeWx8 = TransposeUVWx8_C; + TransposeWxH = TransposeUVWxH_C; + } + + // work through the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + + src += 8 * src_stride; // go down 8 rows + dst_a += 8; // move over 8 columns + dst_b += 8; // move over 8 columns + i -= 8; + } + + TransposeWxH(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); + +#if defined(HAS_TRANSPOSE_UVWX8_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { + RestoreRegisters_NEON(store_reg); + } +#endif +} + +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_REVERSE_LINE_UV_SSSE3 +__declspec(naked) +void ReverseLineUV_SSSE3(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width) { +__asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_a + mov edi, [esp + 4 + 12] // dst_b + mov ecx, [esp + 4 + 16] // width + movdqa xmm7, _kShuffleReverseUV + lea eax, [eax + ecx * 2 - 16] + + convertloop : + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm7 + movlpd qword ptr [edx], xmm0 + lea edx, [edx + 8] + movhpd qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja convertloop + pop edi + ret + } +} + +#elif (defined(__i386__) || defined(__x86_64__)) && \ + !defined(COVERAGE_ENABLED) && 
!defined(TARGET_IPHONE_SIMULATOR) +#define HAS_REVERSE_LINE_UV_SSSE3 +void ReverseLineUV_SSSE3(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width) { + intptr_t temp_width = static_cast<intptr_t>(width); + asm volatile( + "movdqa (%4),%%xmm7\n" + "lea -0x10(%0,%3,2),%0\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea -0x10(%0),%0\n" + "pshufb %%xmm7,%%xmm0\n" + "movlpd %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "movhpd %%xmm0,(%2)\n" + "lea 0x8(%2),%2\n" + "sub $0x8,%3\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(temp_width) // %3 + : "r"(kShuffleReverseUV) // %4 + : "memory" +); +} +#endif + +static void ReverseLineUV_C(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width) { + int i; + src += width << 1; + for (i = 0; i < width; ++i) { + src -= 2; + dst_a[i] = src[0]; + dst_b[i] = src[1]; + } +} + +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + reverse_uv_func ReverseLine; + +#if defined(HAS_REVERSE_LINE_UV_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && + IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { + ReverseLine = ReverseLineUV_NEON; + } else +#endif +#if defined(HAS_REVERSE_LINE_UV_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && + IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { + ReverseLine = ReverseLineUV_SSSE3; + } else +#endif + { + ReverseLine = ReverseLineUV_C; + } + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + ReverseLine(src, dst_a, dst_b, width); + + src += src_stride; // down one line at a time + dst_a -= dst_stride_a; // nominally up one line at a time + dst_b -= dst_stride_b; // nominally up one line at a time + } +} + +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + // Negative height means invert the image. 
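+  // All three source planes are flipped: each pointer moves to its last
+  // row and the strides go negative. halfheight is recomputed from the
+  // now-positive height so the chroma planes flip by the correct amount.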
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + + // Negative height means invert the image. 
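+  // NV12 carries chroma as a single interleaved UV plane of halfheight
+  // rows, so only one chroma pointer and stride need to be flipped here.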
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // copy frame
+      return NV12ToI420(src_y, src_uv, src_stride_y,
+                        dst_y, dst_stride_y,
+                        dst_u, dst_stride_u,
+                        dst_v, dst_stride_v,
+                        width, height);
+    case kRotate90:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotateUV90(src_uv, src_stride_uv,
+                 dst_u, dst_stride_u,
+                 dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+      return 0;
+    case kRotate270:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV270(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV180(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+}  // namespace libyuv
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
new file mode 100644
index 00000000..75ea957a
--- /dev/null
+++ b/files/source/rotate_neon.s
@@ -0,0 +1,563 @@
+  .global RestoreRegisters_NEON
+  .global ReverseLine_NEON
+  .global ReverseLineUV_NEON
+  .global SaveRegisters_NEON
+  .global TransposeWx8_NEON
+  .global TransposeUVWx8_NEON
+  .type RestoreRegisters_NEON, function
+  .type ReverseLine_NEON, function
+  .type ReverseLineUV_NEON, function
+  .type SaveRegisters_NEON, function
+  .type TransposeWx8_NEON, function
+  .type TransposeUVWx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
+ReverseLine_NEON:
+
+  @ compute where to start writing destination
+  add r1, r2            @ dst + width
+
+  @ work on segments that are multiples of 16
+  lsrs r3, r2, #4
+
+  @ the output is written in two blocks: 8 bytes followed
+  @ by another 8. reading is done sequentially, from left to
+  @ right. writing is done from right to left in 8 byte blocks.
+  @ r1, the destination pointer, is incremented after writing
+  @ the first of the two blocks. need to subtract that 8 off
+  @ along with 16 to get the next location.
+  mov r3, #-24
+
+  beq Lline_residuals
+
+  @ back the destination up by the size of the register that is
+  @ going to be reversed
+  sub r1, #16
+
+  @ the loop needs to run on blocks of 16. what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0. if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub r2, #16
+
+Lsegments_of_16:
+  vld1.8 {q0}, [r0]!    @ src += 16
+
+  @ reverse the bytes in the 64 bit segments. unable to reverse
+  @ the bytes in the entire 128 bits in one go.
+  vrev64.8 q0, q0
+
+  @ because of the inability to reverse the entire 128 bits,
+  @ reverse the writing out of the two 64 bit segments.
+  vst1.8 {d1}, [r1]!
+  vst1.8 {d0}, [r1], r3 @ dst -= 16
+
+  subs r2, #16
+  bge Lsegments_of_16
+
+  @ add 16 back to the counter. if the result is 0 there are no
+  @ residuals, so return
+  adds r2, #16
+  bxeq lr
+
+  add r1, #16
+
+Lline_residuals:
+
+  mov r3, #-3
+
+  sub r1, #2
+  subs r2, #2
+  @ check for 16*n+1 scenarios where segments_of_2 should not
+  @ be run, but there is something left over.
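+  @ (e.g. width 17: the 16 byte loop leaves r2 = 1, the subs above goes
+  @ negative, so the 2 byte loop is skipped and the final byte is copied
+  @ at Lsegment_of_1 below.)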
+ blt Lsegment_of_1 + +@ do this in neon registers as per +@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ +Lsegments_of_2: + vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 + + vst1.8 {d1[0]}, [r1]! + vst1.8 {d0[0]}, [r1], r3 @ dst -= 2 + + subs r2, #2 + bge Lsegments_of_2 + + adds r2, #2 + bxeq lr + +Lsegment_of_1: + add r1, #1 + vld1.8 {d0[0]}, [r0] + vst1.8 {d0[0]}, [r1] + + bx lr + +@ void TransposeWx8_NEON (const uint8* src, int src_stride, +@ uint8* dst, int dst_stride, +@ int w) +@ r0 const uint8* src +@ r1 int src_stride +@ r2 uint8* dst +@ r3 int dst_stride +@ stack int w +TransposeWx8_NEON: + push {r4,r8,r9,lr} + + ldr r8, [sp, #16] @ width + + @ loops are on blocks of 8. loop will stop when + @ counter gets to or below 0. starting the counter + @ at w-8 allow for this + sub r8, #8 + +@ handle 8x8 blocks. this should be the majority of the plane +Lloop_8x8: + mov r9, r0 + + vld1.8 {d0}, [r9], r1 + vld1.8 {d1}, [r9], r1 + vld1.8 {d2}, [r9], r1 + vld1.8 {d3}, [r9], r1 + vld1.8 {d4}, [r9], r1 + vld1.8 {d5}, [r9], r1 + vld1.8 {d6}, [r9], r1 + vld1.8 {d7}, [r9] + + vtrn.8 d1, d0 + vtrn.8 d3, d2 + vtrn.8 d5, d4 + vtrn.8 d7, d6 + + vtrn.16 d1, d3 + vtrn.16 d0, d2 + vtrn.16 d5, d7 + vtrn.16 d4, d6 + + vtrn.32 d1, d5 + vtrn.32 d0, d4 + vtrn.32 d3, d7 + vtrn.32 d2, d6 + + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vrev16.8 q2, q2 + vrev16.8 q3, q3 + + mov r9, r2 + + vst1.8 {d1}, [r9], r3 + vst1.8 {d0}, [r9], r3 + vst1.8 {d3}, [r9], r3 + vst1.8 {d2}, [r9], r3 + vst1.8 {d5}, [r9], r3 + vst1.8 {d4}, [r9], r3 + vst1.8 {d7}, [r9], r3 + vst1.8 {d6}, [r9] + + add r0, #8 @ src += 8 + add r2, r3, lsl #3 @ dst += 8 * dst_stride + subs r8, #8 @ w -= 8 + bge Lloop_8x8 + + @ add 8 back to counter. if the result is 0 there are + @ no residuals. 
+ adds r8, #8 + beq Ldone + + @ some residual, so between 1 and 7 lines left to transpose + cmp r8, #2 + blt Lblock_1x8 + + cmp r8, #4 + blt Lblock_2x8 + +Lblock_4x8: + mov r9, r0 + vld1.32 {d0[0]}, [r9], r1 + vld1.32 {d0[1]}, [r9], r1 + vld1.32 {d1[0]}, [r9], r1 + vld1.32 {d1[1]}, [r9], r1 + vld1.32 {d2[0]}, [r9], r1 + vld1.32 {d2[1]}, [r9], r1 + vld1.32 {d3[0]}, [r9], r1 + vld1.32 {d3[1]}, [r9] + + mov r9, r2 + + adr r12, vtbl_4x4_transpose + vld1.8 {q3}, [r12] + + vtbl.8 d4, {d0, d1}, d6 + vtbl.8 d5, {d0, d1}, d7 + vtbl.8 d0, {d2, d3}, d6 + vtbl.8 d1, {d2, d3}, d7 + + @ TODO: rework shuffle above to write + @ out with 4 instead of 8 writes + vst1.32 {d4[0]}, [r9], r3 + vst1.32 {d4[1]}, [r9], r3 + vst1.32 {d5[0]}, [r9], r3 + vst1.32 {d5[1]}, [r9] + + add r9, r2, #4 + vst1.32 {d0[0]}, [r9], r3 + vst1.32 {d0[1]}, [r9], r3 + vst1.32 {d1[0]}, [r9], r3 + vst1.32 {d1[1]}, [r9] + + add r0, #4 @ src += 4 + add r2, r3, lsl #2 @ dst += 4 * dst_stride + subs r8, #4 @ w -= 4 + beq Ldone + + @ some residual, check to see if it includes a 2x8 block, + @ or less + cmp r8, #2 + blt Lblock_1x8 + +Lblock_2x8: + mov r9, r0 + vld1.16 {d0[0]}, [r9], r1 + vld1.16 {d1[0]}, [r9], r1 + vld1.16 {d0[1]}, [r9], r1 + vld1.16 {d1[1]}, [r9], r1 + vld1.16 {d0[2]}, [r9], r1 + vld1.16 {d1[2]}, [r9], r1 + vld1.16 {d0[3]}, [r9], r1 + vld1.16 {d1[3]}, [r9] + + vtrn.8 d0, d1 + + mov r9, r2 + + vst1.64 {d0}, [r9], r3 + vst1.64 {d1}, [r9] + + add r0, #2 @ src += 2 + add r2, r3, lsl #1 @ dst += 2 * dst_stride + subs r8, #2 @ w -= 2 + beq Ldone + +Lblock_1x8: + vld1.8 {d0[0]}, [r0], r1 + vld1.8 {d0[1]}, [r0], r1 + vld1.8 {d0[2]}, [r0], r1 + vld1.8 {d0[3]}, [r0], r1 + vld1.8 {d0[4]}, [r0], r1 + vld1.8 {d0[5]}, [r0], r1 + vld1.8 {d0[6]}, [r0], r1 + vld1.8 {d0[7]}, [r0] + + vst1.64 {d0}, [r2] + +Ldone: + + pop {r4,r8,r9,pc} + +vtbl_4x4_transpose: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + +@ void SaveRegisters_NEON (unsigned long long store) +@ r0 unsigned long long store +SaveRegisters_NEON: + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + +@ void RestoreRegisters_NEON (unsigned long long store) +@ r0 unsigned long long store +RestoreRegisters_NEON: + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + +@ void ReverseLineUV_NEON (const uint8* src, +@ uint8* dst_a, +@ uint8* dst_b, +@ int width) +@ r0 const uint8* src +@ r1 uint8* dst_a +@ r2 uint8* dst_b +@ r3 width +ReverseLineUV_NEON: + + @ compute where to start writing destination + add r1, r1, r3 @ dst_a + width + add r2, r2, r3 @ dst_b + width + + @ work on input segments that are multiples of 16, but + @ width that has been passed is output segments, half + @ the size of input. + lsrs r12, r3, #3 + + beq Lline_residuals_di + + @ the output is written in to two blocks. + mov r12, #-8 + + @ back of destination by the size of the register that is + @ going to be reversed + sub r1, r1, #8 + sub r2, r2, #8 + + @ the loop needs to run on blocks of 8. what will be left + @ over is either a negative number, the residuals that need + @ to be done, or 0. if this isn't subtracted off here the + @ loop will run one extra time. + sub r3, r3, #8 + +Lsegments_of_8_di: + vld2.8 {d0, d1}, [r0]! @ src += 16 + + @ reverse the bytes in the 64 bit segments + vrev64.8 q0, q0 + + vst1.8 {d0}, [r1], r12 @ dst_a -= 8 + vst1.8 {d1}, [r2], r12 @ dst_b -= 8 + + subs r3, r3, #8 + bge Lsegments_of_8_di + + @ add 8 back to the counter. 
if the result is 0 there is no + @ residuals so return + adds r3, r3, #8 + bxeq lr + + add r1, r1, #8 + add r2, r2, #8 + +Lline_residuals_di: + + mov r12, #-1 + + sub r1, r1, #1 + sub r2, r2, #1 + +@ do this in neon registers as per +@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ +Lsegments_of_1: + vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 + + vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1 + vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1 + + subs r3, r3, #1 + bgt Lsegments_of_1 + + bx lr + +@ void TransposeUVWx8_NEON (const uint8* src, int src_stride, +@ uint8* dst_a, int dst_stride_a, +@ uint8* dst_b, int dst_stride_b, +@ int width) +@ r0 const uint8* src +@ r1 int src_stride +@ r2 uint8* dst_a +@ r3 int dst_stride_a +@ stack uint8* dst_b +@ stack int dst_stride_b +@ stack int width +TransposeUVWx8_NEON: + push {r4-r9,lr} + + ldr r4, [sp, #28] @ dst_b + ldr r5, [sp, #32] @ dst_stride_b + ldr r8, [sp, #36] @ width + @ loops are on blocks of 8. loop will stop when + @ counter gets to or below 0. starting the counter + @ at w-8 allow for this + sub r8, #8 + +@ handle 8x8 blocks. this should be the majority of the plane +Lloop_8x8_di: + mov r9, r0 + + vld2.8 {d0, d1}, [r9], r1 + vld2.8 {d2, d3}, [r9], r1 + vld2.8 {d4, d5}, [r9], r1 + vld2.8 {d6, d7}, [r9], r1 + vld2.8 {d8, d9}, [r9], r1 + vld2.8 {d10, d11}, [r9], r1 + vld2.8 {d12, d13}, [r9], r1 + vld2.8 {d14, d15}, [r9] + + vtrn.8 q1, q0 + vtrn.8 q3, q2 + vtrn.8 q5, q4 + vtrn.8 q7, q6 + + vtrn.16 q1, q3 + vtrn.16 q0, q2 + vtrn.16 q5, q7 + vtrn.16 q4, q6 + + vtrn.32 q1, q5 + vtrn.32 q0, q4 + vtrn.32 q3, q7 + vtrn.32 q2, q6 + + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vrev16.8 q2, q2 + vrev16.8 q3, q3 + vrev16.8 q4, q4 + vrev16.8 q5, q5 + vrev16.8 q6, q6 + vrev16.8 q7, q7 + + mov r9, r2 + + vst1.8 {d2}, [r9], r3 + vst1.8 {d0}, [r9], r3 + vst1.8 {d6}, [r9], r3 + vst1.8 {d4}, [r9], r3 + vst1.8 {d10}, [r9], r3 + vst1.8 {d8}, [r9], r3 + vst1.8 {d14}, [r9], r3 + vst1.8 {d12}, [r9] + + mov r9, r4 + + vst1.8 {d3}, [r9], r5 + vst1.8 {d1}, [r9], r5 + vst1.8 {d7}, [r9], r5 + vst1.8 {d5}, [r9], r5 + vst1.8 {d11}, [r9], r5 + vst1.8 {d9}, [r9], r5 + vst1.8 {d15}, [r9], r5 + vst1.8 {d13}, [r9] + + add r0, #8*2 @ src += 8*2 + add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a + add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b + subs r8, #8 @ w -= 8 + bge Lloop_8x8_di + + @ add 8 back to counter. if the result is 0 there are + @ no residuals. 
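+ @
+ @ note: this is the interleaved UV variant of the transpose.
+ @ vld2.8 deinterleaves on load, so per 8x8 block the code above
+ @ is equivalent to the scalar loop (sketch only):
+ @
+ @   for (int i = 0; i < 8; ++i)
+ @     for (int j = 0; j < 8; ++j) {
+ @       dst_a[j * dst_stride_a + i] = src[i * src_stride + 2 * j];
+ @       dst_b[j * dst_stride_b + i] = src[i * src_stride + 2 * j + 1];
+ @     }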
+ adds r8, #8
+ beq Ldone_di
+
+ @ some residual, so between 1 and 7 lines left to transpose
+ cmp r8, #2
+ blt Lblock_1x8_di
+
+ cmp r8, #4
+ blt Lblock_2x8_di
+
+@ TODO(frkoenig) : clean this up
+Lblock_4x8_di:
+ mov r9, r0
+ vld1.64 {d0}, [r9], r1
+ vld1.64 {d1}, [r9], r1
+ vld1.64 {d2}, [r9], r1
+ vld1.64 {d3}, [r9], r1
+ vld1.64 {d4}, [r9], r1
+ vld1.64 {d5}, [r9], r1
+ vld1.64 {d6}, [r9], r1
+ vld1.64 {d7}, [r9]
+
+ adr r12, vtbl_4x4_transpose_di
+ vld1.8 {q7}, [r12]
+
+ vtrn.8 q0, q1
+ vtrn.8 q2, q3
+
+ vtbl.8 d8, {d0, d1}, d14
+ vtbl.8 d9, {d0, d1}, d15
+ vtbl.8 d10, {d2, d3}, d14
+ vtbl.8 d11, {d2, d3}, d15
+ vtbl.8 d12, {d4, d5}, d14
+ vtbl.8 d13, {d4, d5}, d15
+ vtbl.8 d0, {d6, d7}, d14
+ vtbl.8 d1, {d6, d7}, d15
+
+ mov r9, r2
+
+ vst1.32 {d8[0]}, [r9], r3
+ vst1.32 {d8[1]}, [r9], r3
+ vst1.32 {d9[0]}, [r9], r3
+ vst1.32 {d9[1]}, [r9], r3
+
+ add r9, r2, #4
+ vst1.32 {d12[0]}, [r9], r3
+ vst1.32 {d12[1]}, [r9], r3
+ vst1.32 {d13[0]}, [r9], r3
+ vst1.32 {d13[1]}, [r9]
+
+ mov r9, r4
+
+ vst1.32 {d10[0]}, [r9], r5
+ vst1.32 {d10[1]}, [r9], r5
+ vst1.32 {d11[0]}, [r9], r5
+ vst1.32 {d11[1]}, [r9], r5
+
+ add r9, r4, #4
+ vst1.32 {d0[0]}, [r9], r5
+ vst1.32 {d0[1]}, [r9], r5
+ vst1.32 {d1[0]}, [r9], r5
+ vst1.32 {d1[1]}, [r9]
+
+ add r0, #4*2 @ src += 4 * 2
+ add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
+ add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b
+ subs r8, #4 @ w -= 4
+ beq Ldone_di
+
+ @ some residual, check to see if it includes a 2x8 block,
+ @ or less
+ cmp r8, #2
+ blt Lblock_1x8_di
+
+Lblock_2x8_di:
+ mov r9, r0
+ vld2.16 {d0[0], d2[0]}, [r9], r1
+ vld2.16 {d1[0], d3[0]}, [r9], r1
+ vld2.16 {d0[1], d2[1]}, [r9], r1
+ vld2.16 {d1[1], d3[1]}, [r9], r1
+ vld2.16 {d0[2], d2[2]}, [r9], r1
+ vld2.16 {d1[2], d3[2]}, [r9], r1
+ vld2.16 {d0[3], d2[3]}, [r9], r1
+ vld2.16 {d1[3], d3[3]}, [r9]
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+
+ mov r9, r2
+
+ vst1.64 {d0}, [r9], r3
+ vst1.64 {d2}, [r9]
+
+ mov r9, r4
+
+ vst1.64 {d1}, [r9], r5
+ vst1.64 {d3}, [r9]
+
+ add r0, #2*2 @ src += 2 * 2
+ add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a
+ add r4, r5, lsl #1 @ dst_b += 2 * dst_stride_b
+ subs r8, #2 @ w -= 2
+ beq Ldone_di
+
+Lblock_1x8_di:
+ vld2.8 {d0[0], d1[0]}, [r0], r1
+ vld2.8 {d0[1], d1[1]}, [r0], r1
+ vld2.8 {d0[2], d1[2]}, [r0], r1
+ vld2.8 {d0[3], d1[3]}, [r0], r1
+ vld2.8 {d0[4], d1[4]}, [r0], r1
+ vld2.8 {d0[5], d1[5]}, [r0], r1
+ vld2.8 {d0[6], d1[6]}, [r0], r1
+ vld2.8 {d0[7], d1[7]}, [r0]
+
+ vst1.64 {d0}, [r2]
+ vst1.64 {d1}, [r4]
+
+Ldone_di:
+ pop {r4-r9, pc}
+
+vtbl_4x4_transpose_di:
+ .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
diff --git a/files/source/rotate_priv.h b/files/source/rotate_priv.h
new file mode 100644
index 00000000..b4df1494
--- /dev/null
+++ b/files/source/rotate_priv.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef SOURCE_ROTATE_PRIV_H_
+#define SOURCE_ROTATE_PRIV_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Rotate planes by 90, 180, or 270 degrees.
+void
+RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+void
+RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+void
+RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one interleaved input
+// pointer and split the data into two planar
+// output buffers while rotating them.
+void
+RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+void
+RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+void
+RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with the read/write order reversed
+// results in a rotation by +/- 90 degrees.
+void
+TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+void
+TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+} // namespace libyuv
+
+#endif // SOURCE_ROTATE_PRIV_H_
diff --git a/files/source/row.h b/files/source/row.h
new file mode 100644
index 00000000..85343c56
--- /dev/null
+++ b/files/source/row.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "libyuv/basic_types.h" + +// The following are available on all x86 platforms +#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 +#endif + +// The following are available only on Windows +#if defined(WIN32) \ + && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_BGRATOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#endif + +extern "C" { +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#endif + +#ifdef OSX +extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +extern 
SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); +#else +extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); +extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); +extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); +#endif +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width); + +// Method to force C version. +//#define USE_MMX 0 +//#define USE_SSE2 0 + +#if !defined(USE_MMX) +// Windows, Mac and Linux use MMX +#if defined(__i386__) || defined(_MSC_VER) +#define USE_MMX 1 +#else +#define USE_MMX 0 +#endif +#endif + +#if !defined(USE_SSE2) +#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 +#define USE_SSE2 1 +#else +#define USE_SSE2 0 +#endif +#endif + +// x64 uses MMX2 (SSE) so emms is not required. +// Warning C4799: function has no EMMS instruction. +// EMMS() is slow and should be called by the calling function once per image. +#if USE_MMX && !defined(ARCH_CPU_X86_64) +#if defined(_MSC_VER) +#define EMMS() __asm emms +#pragma warning(disable: 4799) +#else +#define EMMS() asm("emms") +#endif +#else +#define EMMS() +#endif + + +} // extern "C" + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/files/source/row_posix.cc b/files/source/row_posix.cc new file mode 100644 index 00000000..88ce475b --- /dev/null +++ b/files/source/row_posix.cc @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +extern "C" { + +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constant multiplication table for converting ARGB to I400. +extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { + 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u +}; + +extern "C" TALIGN16(const uint8, kAdd16[16]) = { + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u +}; + +// Shuffle table for converting BG24 to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile( + "movdqa (%3),%%xmm7\n" + "movdqa (%4),%%xmm6\n" + "movdqa %%xmm6,%%xmm5\n" + "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte. 
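+ // The weights in kMultiplyMaskARGBToI400 are (to within rounding)
+ // the BT.601 coefficients used by RGBToY in row_table.cc
+ // (25, 129, 66 for B, G, R) halved to 13, 64, 33 so they fit the
+ // signed-byte operand range of pmaddubsw; that is why the sums
+ // below are shifted right by 7 rather than 8 before the 0x10
+ // (+16) bias in xmm5 is added.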
+"1:" + "movdqa (%0),%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "psrlw $0x7,%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm1\n" + "lea 0x20(%0),%0\n" + "psrlw $0x7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "pmaddubsw %%xmm6,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "paddb %%xmm5,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "r"(kMultiplyMaskARGBToI400), // %3 + "r"(kAdd16) // %4 + : "memory" +); +} +#endif + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_bg24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskBG24ToARGB) // %3 + : "memory" +); +} + +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskRAWToARGB) // %3 + : "memory" +); +} +#endif + +#if defined(__x86_64__) + +// 64 bit linux gcc version + +void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( +"1:" + "movzb (%1),%%r10\n" + "lea 1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "lea 8(%3),%3\n" + "sub $0x2,%4\n" + "ja 1b\n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r" (_kCoefficientsRgbY) // %5 + : "memory", 
"r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( +"1:" + "movzb (%1),%%r10\n" + "lea 1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "lea 8(%3),%3\n" + "sub $0x2,%4\n" + "ja 1b\n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r" (_kCoefficientsBgraY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( +"1:" + "movzb (%1),%%r10\n" + "lea 1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "movzb 0x1(%0),%%r11\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%5,%%r11,8),%%xmm3\n" + "paddsw %%xmm0,%%xmm2\n" + "paddsw %%xmm0,%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%3)\n" + "lea 8(%3),%3\n" + "sub $0x2,%4\n" + "ja 1b\n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r" (_kCoefficientsAbgrY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( +"1:" + "movzb (%1),%%r10\n" + "lea 1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 1(%0),%0\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movd %%xmm2,0x0(%3)\n" + "lea 4(%3),%3\n" + "sub $0x1,%4\n" + "ja 1b\n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+r"(width) // %4 + : "r" (_kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2" +); +} + +void FastConvertYToRGB32Row(const uint8* y_buf, // rdi + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( +"1:" + "movzb (%0),%%r10\n" + "movzb 0x1(%0),%%r11\n" + "movq (%3,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%3,%%r11,8),%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%1)\n" + "lea 8(%1),%1\n" + "sub $0x2,%2\n" + "ja 1b\n" + : "+r"(y_buf), // %0 + "+r"(rgb_buf), // %1 + "+r"(width) // %2 + : "r" (_kCoefficientsRgbY) // %3 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + +#elif defined(__i386__) +// 32 bit gcc version + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl 
_FastConvertYUVToRGB32Row\n" +"_FastConvertYUVToRGB32Row:\n" +#else + ".global FastConvertYUVToRGB32Row\n" +"FastConvertYUVToRGB32Row:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq _kCoefficientsRgbY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUVToBGRARow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUVToBGRARow\n" +"_FastConvertYUVToBGRARow:\n" +#else + ".global FastConvertYUVToBGRARow\n" +"FastConvertYUVToBGRARow:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq _kCoefficientsBgraY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUVToABGRRow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUVToABGRRow\n" +"_FastConvertYUVToABGRRow:\n" +#else + ".global FastConvertYUVToABGRRow\n" +"FastConvertYUVToABGRRow:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUV444ToRGB32Row\n" +"_FastConvertYUV444ToRGB32Row:\n" +#else + ".global FastConvertYUV444ToRGB32Row\n" +"FastConvertYUV444ToRGB32Row:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 
1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "lea 1(%edx),%edx\n" + "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n" + "psraw $0x6,%mm0\n" + "packuswb %mm0,%mm0\n" + "movd %mm0,0x0(%ebp)\n" + "lea 4(%ebp),%ebp\n" + "sub $0x1,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYToRGB32Row\n" +"_FastConvertYToRGB32Row:\n" +#else + ".global FastConvertYToRGB32Row\n" +"FastConvertYToRGB32Row:\n" +#endif + "push %ebx\n" + "mov 0x8(%esp),%eax\n" + "mov 0xc(%esp),%edx\n" + "mov 0x10(%esp),%ecx\n" + +"1:" + "movzbl (%eax),%ebx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n" + "psraw $0x6,%mm0\n" + "movzbl 0x1(%eax),%ebx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm0\n" + "lea 0x2(%eax),%eax\n" + "movq %mm0,(%edx)\n" + "lea 0x8(%edx),%edx\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "pop %ebx\n" + "ret\n" +); + +#else +// C reference code that mimic the YUV assembly. +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf, + int ashift, + int rshift, + int gshift, + int bshift) { + + int b = _kCoefficientsRgbY[256+u][0]; + int g = _kCoefficientsRgbY[256+u][1]; + int r = _kCoefficientsRgbY[256+u][2]; + int a = _kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, _kCoefficientsRgbY[512+v][0]); + g = paddsw(g, _kCoefficientsRgbY[512+v][1]); + r = paddsw(r, _kCoefficientsRgbY[512+v][2]); + a = paddsw(a, _kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, _kCoefficientsRgbY[y][0]); + g = paddsw(g, _kCoefficientsRgbY[y][1]); + r = paddsw(r, _kCoefficientsRgbY[y][2]); + a = paddsw(a, _kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) | + (packuswb(g) << gshift) | + (packuswb(r) << rshift) | + (packuswb(a) << ashift); +} + +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +void FastConvertYUVToBGRARow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24); + } + rgb_buf += 8; // Advance 2 pixels. + } +} + +void FastConvertYUVToABGRRow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; x += 2) { + uint8 u = u_buf[x >> 1]; + uint8 v = v_buf[x >> 1]; + uint8 y0 = y_buf[x]; + YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16); + if ((x + 1) < width) { + uint8 y1 = y_buf[x + 1]; + YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16); + } + rgb_buf += 8; // Advance 2 pixels. 
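+ // Note: the (ashift, rshift, gshift, bshift) arguments to
+ // YuvPixel choose each channel's bit position within the packed
+ // little-endian word, so (24, 0, 8, 16) stores bytes in
+ // R, G, B, A order in memory, i.e. ABGR when read as a uint32.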
+ } +} + +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + uint8 u = u_buf[x]; + uint8 v = v_buf[x]; + uint8 y = y_buf[x]; + YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0); + rgb_buf += 4; // Advance 1 pixel. + } +} + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + uint8 y = y_buf[x]; + YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0); + rgb_buf += 4; // Advance 1 pixel. + } +} + +#endif + +} // extern "C" diff --git a/files/source/row_table.cc b/files/source/row_table.cc new file mode 100644 index 00000000..022d9f88 --- /dev/null +++ b/files/source/row_table.cc @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +#define kMaxStride (2048 * 4) + +extern "C" { + +#define MAKETABLE(NAME) \ +SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\ + RGBY(0x00), RGBY(0x01), RGBY(0x02), RGBY(0x03), \ + RGBY(0x04), RGBY(0x05), RGBY(0x06), RGBY(0x07), \ + RGBY(0x08), RGBY(0x09), RGBY(0x0A), RGBY(0x0B), \ + RGBY(0x0C), RGBY(0x0D), RGBY(0x0E), RGBY(0x0F), \ + RGBY(0x10), RGBY(0x11), RGBY(0x12), RGBY(0x13), \ + RGBY(0x14), RGBY(0x15), RGBY(0x16), RGBY(0x17), \ + RGBY(0x18), RGBY(0x19), RGBY(0x1A), RGBY(0x1B), \ + RGBY(0x1C), RGBY(0x1D), RGBY(0x1E), RGBY(0x1F), \ + RGBY(0x20), RGBY(0x21), RGBY(0x22), RGBY(0x23), \ + RGBY(0x24), RGBY(0x25), RGBY(0x26), RGBY(0x27), \ + RGBY(0x28), RGBY(0x29), RGBY(0x2A), RGBY(0x2B), \ + RGBY(0x2C), RGBY(0x2D), RGBY(0x2E), RGBY(0x2F), \ + RGBY(0x30), RGBY(0x31), RGBY(0x32), RGBY(0x33), \ + RGBY(0x34), RGBY(0x35), RGBY(0x36), RGBY(0x37), \ + RGBY(0x38), RGBY(0x39), RGBY(0x3A), RGBY(0x3B), \ + RGBY(0x3C), RGBY(0x3D), RGBY(0x3E), RGBY(0x3F), \ + RGBY(0x40), RGBY(0x41), RGBY(0x42), RGBY(0x43), \ + RGBY(0x44), RGBY(0x45), RGBY(0x46), RGBY(0x47), \ + RGBY(0x48), RGBY(0x49), RGBY(0x4A), RGBY(0x4B), \ + RGBY(0x4C), RGBY(0x4D), RGBY(0x4E), RGBY(0x4F), \ + RGBY(0x50), RGBY(0x51), RGBY(0x52), RGBY(0x53), \ + RGBY(0x54), RGBY(0x55), RGBY(0x56), RGBY(0x57), \ + RGBY(0x58), RGBY(0x59), RGBY(0x5A), RGBY(0x5B), \ + RGBY(0x5C), RGBY(0x5D), RGBY(0x5E), RGBY(0x5F), \ + RGBY(0x60), RGBY(0x61), RGBY(0x62), RGBY(0x63), \ + RGBY(0x64), RGBY(0x65), RGBY(0x66), RGBY(0x67), \ + RGBY(0x68), RGBY(0x69), RGBY(0x6A), RGBY(0x6B), \ + RGBY(0x6C), RGBY(0x6D), RGBY(0x6E), RGBY(0x6F), \ + RGBY(0x70), RGBY(0x71), RGBY(0x72), RGBY(0x73), \ + RGBY(0x74), RGBY(0x75), RGBY(0x76), RGBY(0x77), \ + RGBY(0x78), RGBY(0x79), RGBY(0x7A), RGBY(0x7B), \ + RGBY(0x7C), RGBY(0x7D), RGBY(0x7E), RGBY(0x7F), \ + RGBY(0x80), RGBY(0x81), RGBY(0x82), RGBY(0x83), \ + RGBY(0x84), RGBY(0x85), RGBY(0x86), RGBY(0x87), \ + RGBY(0x88), RGBY(0x89), RGBY(0x8A), RGBY(0x8B), \ + RGBY(0x8C), RGBY(0x8D), RGBY(0x8E), RGBY(0x8F), \ + RGBY(0x90), RGBY(0x91), RGBY(0x92), RGBY(0x93), \ + RGBY(0x94), RGBY(0x95), RGBY(0x96), RGBY(0x97), \ + RGBY(0x98), RGBY(0x99), RGBY(0x9A), RGBY(0x9B), \ + RGBY(0x9C), RGBY(0x9D), RGBY(0x9E), RGBY(0x9F), \ + RGBY(0xA0), RGBY(0xA1), RGBY(0xA2), RGBY(0xA3), \ + RGBY(0xA4), RGBY(0xA5), RGBY(0xA6), RGBY(0xA7), \ + RGBY(0xA8), RGBY(0xA9), RGBY(0xAA), 
RGBY(0xAB), \ + RGBY(0xAC), RGBY(0xAD), RGBY(0xAE), RGBY(0xAF), \ + RGBY(0xB0), RGBY(0xB1), RGBY(0xB2), RGBY(0xB3), \ + RGBY(0xB4), RGBY(0xB5), RGBY(0xB6), RGBY(0xB7), \ + RGBY(0xB8), RGBY(0xB9), RGBY(0xBA), RGBY(0xBB), \ + RGBY(0xBC), RGBY(0xBD), RGBY(0xBE), RGBY(0xBF), \ + RGBY(0xC0), RGBY(0xC1), RGBY(0xC2), RGBY(0xC3), \ + RGBY(0xC4), RGBY(0xC5), RGBY(0xC6), RGBY(0xC7), \ + RGBY(0xC8), RGBY(0xC9), RGBY(0xCA), RGBY(0xCB), \ + RGBY(0xCC), RGBY(0xCD), RGBY(0xCE), RGBY(0xCF), \ + RGBY(0xD0), RGBY(0xD1), RGBY(0xD2), RGBY(0xD3), \ + RGBY(0xD4), RGBY(0xD5), RGBY(0xD6), RGBY(0xD7), \ + RGBY(0xD8), RGBY(0xD9), RGBY(0xDA), RGBY(0xDB), \ + RGBY(0xDC), RGBY(0xDD), RGBY(0xDE), RGBY(0xDF), \ + RGBY(0xE0), RGBY(0xE1), RGBY(0xE2), RGBY(0xE3), \ + RGBY(0xE4), RGBY(0xE5), RGBY(0xE6), RGBY(0xE7), \ + RGBY(0xE8), RGBY(0xE9), RGBY(0xEA), RGBY(0xEB), \ + RGBY(0xEC), RGBY(0xED), RGBY(0xEE), RGBY(0xEF), \ + RGBY(0xF0), RGBY(0xF1), RGBY(0xF2), RGBY(0xF3), \ + RGBY(0xF4), RGBY(0xF5), RGBY(0xF6), RGBY(0xF7), \ + RGBY(0xF8), RGBY(0xF9), RGBY(0xFA), RGBY(0xFB), \ + RGBY(0xFC), RGBY(0xFD), RGBY(0xFE), RGBY(0xFF), \ + RGBU(0x00), RGBU(0x01), RGBU(0x02), RGBU(0x03), \ + RGBU(0x04), RGBU(0x05), RGBU(0x06), RGBU(0x07), \ + RGBU(0x08), RGBU(0x09), RGBU(0x0A), RGBU(0x0B), \ + RGBU(0x0C), RGBU(0x0D), RGBU(0x0E), RGBU(0x0F), \ + RGBU(0x10), RGBU(0x11), RGBU(0x12), RGBU(0x13), \ + RGBU(0x14), RGBU(0x15), RGBU(0x16), RGBU(0x17), \ + RGBU(0x18), RGBU(0x19), RGBU(0x1A), RGBU(0x1B), \ + RGBU(0x1C), RGBU(0x1D), RGBU(0x1E), RGBU(0x1F), \ + RGBU(0x20), RGBU(0x21), RGBU(0x22), RGBU(0x23), \ + RGBU(0x24), RGBU(0x25), RGBU(0x26), RGBU(0x27), \ + RGBU(0x28), RGBU(0x29), RGBU(0x2A), RGBU(0x2B), \ + RGBU(0x2C), RGBU(0x2D), RGBU(0x2E), RGBU(0x2F), \ + RGBU(0x30), RGBU(0x31), RGBU(0x32), RGBU(0x33), \ + RGBU(0x34), RGBU(0x35), RGBU(0x36), RGBU(0x37), \ + RGBU(0x38), RGBU(0x39), RGBU(0x3A), RGBU(0x3B), \ + RGBU(0x3C), RGBU(0x3D), RGBU(0x3E), RGBU(0x3F), \ + RGBU(0x40), RGBU(0x41), RGBU(0x42), RGBU(0x43), \ + RGBU(0x44), RGBU(0x45), RGBU(0x46), RGBU(0x47), \ + RGBU(0x48), RGBU(0x49), RGBU(0x4A), RGBU(0x4B), \ + RGBU(0x4C), RGBU(0x4D), RGBU(0x4E), RGBU(0x4F), \ + RGBU(0x50), RGBU(0x51), RGBU(0x52), RGBU(0x53), \ + RGBU(0x54), RGBU(0x55), RGBU(0x56), RGBU(0x57), \ + RGBU(0x58), RGBU(0x59), RGBU(0x5A), RGBU(0x5B), \ + RGBU(0x5C), RGBU(0x5D), RGBU(0x5E), RGBU(0x5F), \ + RGBU(0x60), RGBU(0x61), RGBU(0x62), RGBU(0x63), \ + RGBU(0x64), RGBU(0x65), RGBU(0x66), RGBU(0x67), \ + RGBU(0x68), RGBU(0x69), RGBU(0x6A), RGBU(0x6B), \ + RGBU(0x6C), RGBU(0x6D), RGBU(0x6E), RGBU(0x6F), \ + RGBU(0x70), RGBU(0x71), RGBU(0x72), RGBU(0x73), \ + RGBU(0x74), RGBU(0x75), RGBU(0x76), RGBU(0x77), \ + RGBU(0x78), RGBU(0x79), RGBU(0x7A), RGBU(0x7B), \ + RGBU(0x7C), RGBU(0x7D), RGBU(0x7E), RGBU(0x7F), \ + RGBU(0x80), RGBU(0x81), RGBU(0x82), RGBU(0x83), \ + RGBU(0x84), RGBU(0x85), RGBU(0x86), RGBU(0x87), \ + RGBU(0x88), RGBU(0x89), RGBU(0x8A), RGBU(0x8B), \ + RGBU(0x8C), RGBU(0x8D), RGBU(0x8E), RGBU(0x8F), \ + RGBU(0x90), RGBU(0x91), RGBU(0x92), RGBU(0x93), \ + RGBU(0x94), RGBU(0x95), RGBU(0x96), RGBU(0x97), \ + RGBU(0x98), RGBU(0x99), RGBU(0x9A), RGBU(0x9B), \ + RGBU(0x9C), RGBU(0x9D), RGBU(0x9E), RGBU(0x9F), \ + RGBU(0xA0), RGBU(0xA1), RGBU(0xA2), RGBU(0xA3), \ + RGBU(0xA4), RGBU(0xA5), RGBU(0xA6), RGBU(0xA7), \ + RGBU(0xA8), RGBU(0xA9), RGBU(0xAA), RGBU(0xAB), \ + RGBU(0xAC), RGBU(0xAD), RGBU(0xAE), RGBU(0xAF), \ + RGBU(0xB0), RGBU(0xB1), RGBU(0xB2), RGBU(0xB3), \ + RGBU(0xB4), RGBU(0xB5), RGBU(0xB6), RGBU(0xB7), \ + RGBU(0xB8), RGBU(0xB9), RGBU(0xBA), RGBU(0xBB), \ + 
RGBU(0xBC), RGBU(0xBD), RGBU(0xBE), RGBU(0xBF), \ + RGBU(0xC0), RGBU(0xC1), RGBU(0xC2), RGBU(0xC3), \ + RGBU(0xC4), RGBU(0xC5), RGBU(0xC6), RGBU(0xC7), \ + RGBU(0xC8), RGBU(0xC9), RGBU(0xCA), RGBU(0xCB), \ + RGBU(0xCC), RGBU(0xCD), RGBU(0xCE), RGBU(0xCF), \ + RGBU(0xD0), RGBU(0xD1), RGBU(0xD2), RGBU(0xD3), \ + RGBU(0xD4), RGBU(0xD5), RGBU(0xD6), RGBU(0xD7), \ + RGBU(0xD8), RGBU(0xD9), RGBU(0xDA), RGBU(0xDB), \ + RGBU(0xDC), RGBU(0xDD), RGBU(0xDE), RGBU(0xDF), \ + RGBU(0xE0), RGBU(0xE1), RGBU(0xE2), RGBU(0xE3), \ + RGBU(0xE4), RGBU(0xE5), RGBU(0xE6), RGBU(0xE7), \ + RGBU(0xE8), RGBU(0xE9), RGBU(0xEA), RGBU(0xEB), \ + RGBU(0xEC), RGBU(0xED), RGBU(0xEE), RGBU(0xEF), \ + RGBU(0xF0), RGBU(0xF1), RGBU(0xF2), RGBU(0xF3), \ + RGBU(0xF4), RGBU(0xF5), RGBU(0xF6), RGBU(0xF7), \ + RGBU(0xF8), RGBU(0xF9), RGBU(0xFA), RGBU(0xFB), \ + RGBU(0xFC), RGBU(0xFD), RGBU(0xFE), RGBU(0xFF), \ + RGBV(0x00), RGBV(0x01), RGBV(0x02), RGBV(0x03), \ + RGBV(0x04), RGBV(0x05), RGBV(0x06), RGBV(0x07), \ + RGBV(0x08), RGBV(0x09), RGBV(0x0A), RGBV(0x0B), \ + RGBV(0x0C), RGBV(0x0D), RGBV(0x0E), RGBV(0x0F), \ + RGBV(0x10), RGBV(0x11), RGBV(0x12), RGBV(0x13), \ + RGBV(0x14), RGBV(0x15), RGBV(0x16), RGBV(0x17), \ + RGBV(0x18), RGBV(0x19), RGBV(0x1A), RGBV(0x1B), \ + RGBV(0x1C), RGBV(0x1D), RGBV(0x1E), RGBV(0x1F), \ + RGBV(0x20), RGBV(0x21), RGBV(0x22), RGBV(0x23), \ + RGBV(0x24), RGBV(0x25), RGBV(0x26), RGBV(0x27), \ + RGBV(0x28), RGBV(0x29), RGBV(0x2A), RGBV(0x2B), \ + RGBV(0x2C), RGBV(0x2D), RGBV(0x2E), RGBV(0x2F), \ + RGBV(0x30), RGBV(0x31), RGBV(0x32), RGBV(0x33), \ + RGBV(0x34), RGBV(0x35), RGBV(0x36), RGBV(0x37), \ + RGBV(0x38), RGBV(0x39), RGBV(0x3A), RGBV(0x3B), \ + RGBV(0x3C), RGBV(0x3D), RGBV(0x3E), RGBV(0x3F), \ + RGBV(0x40), RGBV(0x41), RGBV(0x42), RGBV(0x43), \ + RGBV(0x44), RGBV(0x45), RGBV(0x46), RGBV(0x47), \ + RGBV(0x48), RGBV(0x49), RGBV(0x4A), RGBV(0x4B), \ + RGBV(0x4C), RGBV(0x4D), RGBV(0x4E), RGBV(0x4F), \ + RGBV(0x50), RGBV(0x51), RGBV(0x52), RGBV(0x53), \ + RGBV(0x54), RGBV(0x55), RGBV(0x56), RGBV(0x57), \ + RGBV(0x58), RGBV(0x59), RGBV(0x5A), RGBV(0x5B), \ + RGBV(0x5C), RGBV(0x5D), RGBV(0x5E), RGBV(0x5F), \ + RGBV(0x60), RGBV(0x61), RGBV(0x62), RGBV(0x63), \ + RGBV(0x64), RGBV(0x65), RGBV(0x66), RGBV(0x67), \ + RGBV(0x68), RGBV(0x69), RGBV(0x6A), RGBV(0x6B), \ + RGBV(0x6C), RGBV(0x6D), RGBV(0x6E), RGBV(0x6F), \ + RGBV(0x70), RGBV(0x71), RGBV(0x72), RGBV(0x73), \ + RGBV(0x74), RGBV(0x75), RGBV(0x76), RGBV(0x77), \ + RGBV(0x78), RGBV(0x79), RGBV(0x7A), RGBV(0x7B), \ + RGBV(0x7C), RGBV(0x7D), RGBV(0x7E), RGBV(0x7F), \ + RGBV(0x80), RGBV(0x81), RGBV(0x82), RGBV(0x83), \ + RGBV(0x84), RGBV(0x85), RGBV(0x86), RGBV(0x87), \ + RGBV(0x88), RGBV(0x89), RGBV(0x8A), RGBV(0x8B), \ + RGBV(0x8C), RGBV(0x8D), RGBV(0x8E), RGBV(0x8F), \ + RGBV(0x90), RGBV(0x91), RGBV(0x92), RGBV(0x93), \ + RGBV(0x94), RGBV(0x95), RGBV(0x96), RGBV(0x97), \ + RGBV(0x98), RGBV(0x99), RGBV(0x9A), RGBV(0x9B), \ + RGBV(0x9C), RGBV(0x9D), RGBV(0x9E), RGBV(0x9F), \ + RGBV(0xA0), RGBV(0xA1), RGBV(0xA2), RGBV(0xA3), \ + RGBV(0xA4), RGBV(0xA5), RGBV(0xA6), RGBV(0xA7), \ + RGBV(0xA8), RGBV(0xA9), RGBV(0xAA), RGBV(0xAB), \ + RGBV(0xAC), RGBV(0xAD), RGBV(0xAE), RGBV(0xAF), \ + RGBV(0xB0), RGBV(0xB1), RGBV(0xB2), RGBV(0xB3), \ + RGBV(0xB4), RGBV(0xB5), RGBV(0xB6), RGBV(0xB7), \ + RGBV(0xB8), RGBV(0xB9), RGBV(0xBA), RGBV(0xBB), \ + RGBV(0xBC), RGBV(0xBD), RGBV(0xBE), RGBV(0xBF), \ + RGBV(0xC0), RGBV(0xC1), RGBV(0xC2), RGBV(0xC3), \ + RGBV(0xC4), RGBV(0xC5), RGBV(0xC6), RGBV(0xC7), \ + RGBV(0xC8), RGBV(0xC9), RGBV(0xCA), RGBV(0xCB), \ + RGBV(0xCC), 
RGBV(0xCD), RGBV(0xCE), RGBV(0xCF), \
+ RGBV(0xD0), RGBV(0xD1), RGBV(0xD2), RGBV(0xD3), \
+ RGBV(0xD4), RGBV(0xD5), RGBV(0xD6), RGBV(0xD7), \
+ RGBV(0xD8), RGBV(0xD9), RGBV(0xDA), RGBV(0xDB), \
+ RGBV(0xDC), RGBV(0xDD), RGBV(0xDE), RGBV(0xDF), \
+ RGBV(0xE0), RGBV(0xE1), RGBV(0xE2), RGBV(0xE3), \
+ RGBV(0xE4), RGBV(0xE5), RGBV(0xE6), RGBV(0xE7), \
+ RGBV(0xE8), RGBV(0xE9), RGBV(0xEA), RGBV(0xEB), \
+ RGBV(0xEC), RGBV(0xED), RGBV(0xEE), RGBV(0xEF), \
+ RGBV(0xF0), RGBV(0xF1), RGBV(0xF2), RGBV(0xF3), \
+ RGBV(0xF4), RGBV(0xF5), RGBV(0xF6), RGBV(0xF7), \
+ RGBV(0xF8), RGBV(0xF9), RGBV(0xFA), RGBV(0xFB), \
+ RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
+};
+
+// ARGB table
+#define RGBY(i) { \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(256 * 64 - 1) \
+}
+
+#define RGBU(i) { \
+ static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+ 0, \
+ 0 \
+}
+
+#define RGBV(i) { \
+ 0, \
+ static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+ 0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsRgbY)
+#else
+MAKETABLE(_kCoefficientsRgbY)
+#endif
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+// BGRA table
+#define RGBY(i) { \
+ static_cast<int16>(256 * 64 - 1), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
+}
+
+#define RGBU(i) { \
+ 0, \
+ 0, \
+ static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
+}
+
+#define RGBV(i) { \
+ 0, \
+ static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+ 0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsBgraY)
+#else
+MAKETABLE(_kCoefficientsBgraY)
+#endif
+
+
+#undef RGBY
+#undef RGBU
+#undef RGBV
+
+// ABGR table
+#define RGBY(i) { \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
+ static_cast<int16>(256 * 64 - 1) \
+}
+
+#define RGBU(i) { \
+ 0, \
+ static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+ 0 \
+}
+
+#define RGBV(i) { \
+ static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+ static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+ 0, \
+ 0 \
+}
+
+#ifdef OSX
+MAKETABLE(kCoefficientsAbgrY)
+#else
+MAKETABLE(_kCoefficientsAbgrY)
+#endif
+
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ uint8 b = src_bg24[0];
+ uint8 g = src_bg24[1];
+ uint8 r = src_bg24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_bg24 += 3;
+ }
+}
+
+// The C versions below convert via an intermediate ARGB row,
+// mirroring the two-step structure of the SSSE3 versions.
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ BG24ToARGBRow_C(src_argb, row, pix);
+ ARGBToYRow_C(row, dst_y, pix);
+}
+
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8
row[kMaxStride]); + RAWToARGBRow_C(src_argb, row, pix); + ARGBToYRow_C(row, dst_y, pix); +} + +void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_C(src_argb, row, pix); + BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_C(src_argb, row, pix); + RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +static inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +} + +static inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; +} +static inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +#define MAKEROWY(NAME,R,G,B) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + for (int x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += 4; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + for (int x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \ + src_rgb1[B] + src_rgb1[B + 4]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \ + src_rgb1[G] + src_rgb1[G + 4]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \ + src_rgb1[R] + src_rgb1[R + 4]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += 8; \ + src_rgb1 += 8; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB,2,1,0) +MAKEROWY(BGRA,1,2,3) +MAKEROWY(ABGR,0,1,2) + +#if defined(HAS_RAWTOYROW_SSSE3) + +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); +} + +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); +} + +#endif + +#if defined(HAS_RAWTOUVROW_SSSE3) +#if defined(HAS_ARGBTOUVROW_SSSE3) +void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); +} + +#else + +void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int 
pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +#endif +#endif + +} // extern "C" diff --git a/files/source/row_win.cc b/files/source/row_win.cc new file mode 100644 index 00000000..2bc5fb13 --- /dev/null +++ b/files/source/row_win.cc @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "row.h" + +extern "C" { + +#ifdef HAS_ARGBTOYROW_SSSE3 +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var + +// Constant multiplication table for converting ARGB to I400. +extern "C" TALIGN16(const int8, kARGBToY[16]) = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +extern "C" TALIGN16(const int8, kARGBToU[16]) = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +extern "C" TALIGN16(const int8, kARGBToV[16]) = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +// Constants for BGRA +extern "C" TALIGN16(const int8, kBGRAToY[16]) = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +extern "C" TALIGN16(const int8, kBGRAToU[16]) = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +extern "C" TALIGN16(const int8, kBGRAToV[16]) = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +extern "C" TALIGN16(const int8, kABGRToY[16]) = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +extern "C" TALIGN16(const int8, kABGRToU[16]) = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +extern "C" TALIGN16(const int8, kABGRToV[16]) = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +extern "C" TALIGN16(const uint8, kAddY16[16]) = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, +}; + +extern "C" TALIGN16(const uint8, kAddUV128[16]) = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting BG24 to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. 
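+// (RAW is R,G,B byte order, so entries 2,1,0 swap each pixel to
+// B,G,R; entries 12-15 land in the alpha slot, which the por with
+// the 0xff000000 mask later forces to opaque.)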
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values +__declspec(naked) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kARGBToY + movdqa xmm6, _kAddY16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kBGRAToY + movdqa xmm6, _kAddY16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kABGRToY + movdqa xmm6, _kAddY16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kARGBToU + movdqa xmm6, _kARGBToV + movdqa xmm5, _kAddUV128 + sub edi, edx // stride from u to v + + convertloop : + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw 
xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + pop edi + pop esi + ret + } +} + +__declspec(naked) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kBGRAToU + movdqa xmm6, _kBGRAToV + movdqa xmm5, _kAddUV128 + sub edi, edx // stride from u to v + + convertloop : + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + pop edi + pop esi + ret + } +} + +__declspec(naked) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kABGRToU + movdqa xmm6, _kABGRToV + movdqa xmm5, _kAddUV128 + sub edi, edx // stride from u to v + + convertloop : + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + pop edi + pop esi + ret + } +} + +__declspec(naked) +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 
4] // src_bg24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + movdqa xmm6, _kShuffleMaskBG24ToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx + 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + movdqa xmm6, _kShuffleMaskRAWToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx + 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] + mov edi, [esp + 32 + 8] + mov esi, [esp + 32 + 12] + mov ebp, [esp + 32 + 16] + mov ecx, [esp + 32 + 20] + + convertloop : + movzx eax, byte ptr [edi] + lea edi, [edi + 1] + movzx ebx, byte ptr [esi] + lea esi, [esi + 1] + movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] + movzx ebx, byte ptr [edx + 1] + movq mm1, [_kCoefficientsRgbY + 8 * eax] + lea edx, [edx + 2] + movq mm2, [_kCoefficientsRgbY + 8 * ebx] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + lea ebp, [ebp + 8] + sub ecx, 2 + ja convertloop + + popad + ret + } +} + +__declspec(naked) +void FastConvertYUVToBGRARow(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] + mov edi, [esp + 32 + 8] + mov esi, [esp + 32 + 12] + mov ebp, [esp + 32 + 16] + mov ecx, [esp + 32 + 20] + + convertloop : + movzx eax, byte ptr [edi] + lea edi, [edi + 1] + movzx ebx, byte ptr [esi] + lea esi, [esi + 1] + movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx] + movzx ebx, byte ptr [edx + 1] + movq mm1, [_kCoefficientsBgraY + 8 * eax] + lea edx, [edx + 2] + movq mm2, [_kCoefficientsBgraY + 8 * ebx] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + lea ebp, [ebp + 8] + sub ecx, 2 + ja convertloop + + popad + ret + } +} + +__declspec(naked) +void FastConvertYUVToABGRRow(const uint8* y_buf, + const uint8* u_buf, + const uint8* 
v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] + mov edi, [esp + 32 + 8] + mov esi, [esp + 32 + 12] + mov ebp, [esp + 32 + 16] + mov ecx, [esp + 32 + 20] + + convertloop : + movzx eax, byte ptr [edi] + lea edi, [edi + 1] + movzx ebx, byte ptr [esi] + lea esi, [esi + 1] + movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx] + movzx ebx, byte ptr [edx + 1] + movq mm1, [_kCoefficientsAbgrY + 8 * eax] + lea edx, [edx + 2] + movq mm2, [_kCoefficientsAbgrY + 8 * ebx] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + lea ebp, [ebp + 8] + sub ecx, 2 + ja convertloop + + popad + ret + } +} + +__declspec(naked) +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + + convertloop : + movzx eax, byte ptr [edi] + lea edi, [edi + 1] + movzx ebx, byte ptr [esi] + lea esi, [esi + 1] + movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] + lea edx, [edx + 1] + paddsw mm0, [_kCoefficientsRgbY + 8 * eax] + psraw mm0, 6 + packuswb mm0, mm0 + movd [ebp], mm0 + lea ebp, [ebp + 4] + sub ecx, 1 + ja convertloop + + popad + ret + } +} + +__declspec(naked) +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + push ebx + mov eax, [esp + 4 + 4] // Y + mov edx, [esp + 4 + 8] // rgb + mov ecx, [esp + 4 + 12] // width + + convertloop : + movzx ebx, byte ptr [eax] + movq mm0, [_kCoefficientsRgbY + 8 * ebx] + psraw mm0, 6 + movzx ebx, byte ptr [eax + 1] + movq mm1, [_kCoefficientsRgbY + 8 * ebx] + psraw mm1, 6 + packuswb mm0, mm1 + lea eax, [eax + 2] + movq [edx], mm0 + lea edx, [edx + 8] + sub ecx, 2 + ja convertloop + + pop ebx + ret + } +} + +#endif + +} // extern "C" diff --git a/files/source/scale.cc b/files/source/scale.cc new file mode 100644 index 00000000..d3b7d333 --- /dev/null +++ b/files/source/scale.cc @@ -0,0 +1,3481 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" + +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +namespace libyuv { + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. 
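+// A hedged usage sketch (not part of this file): a test harness comparing
+// the two paths might toggle the setter around its scale-and-diff loop. The
+// Scale() call named here is only a stand-in for whichever libyuv/scale.h
+// entry point the harness already uses.
+//
+//   libyuv::SetUseReferenceImpl(true);   // force ScalePlaneBox() et al.
+//   // ... Scale() a test frame, keep the output ...
+//   libyuv::SetUseReferenceImpl(false);  // restore the SIMD fast paths
+//   // ... Scale() the same frame again and compare the two outputs ...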
+ +static bool use_reference_impl_ = false; + +void SetUseReferenceImpl(bool use) { + use_reference_impl_ = use; +} + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { + __asm__ volatile + ( + "1:\n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + __asm__ volatile + ( + "mov r4, #2 \n" // rounding constant + "add %1, %0 \n" // change the stride to row 2 pointer + "vdup.16 q4, r4 \n" + "1:\n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vadd.u16 q0, q4 \n" // rounding + "vadd.u16 q1, q4 \n" + "vshrn.u16 d0, q0, #2 \n" // downshift and pack + "vshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List + ); +} + +#define HAS_SCALEROWDOWN4_NEON +// Expecting widths on arm devices to be smaller. Went with 8x4 blocks +// to get most coverage. Look to back and evaluate 16x4 blocks with +// handling of leftovers. +static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + __asm__ volatile + ( + "mov r4, #4 \n" + "1: \n" + "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to + "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block + + "vst1.u16 {d0[0]}, [%1]! \n" + + "subs %2, #2 \n" // dst_width -= 2 + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "r4", "q0", "q1", "memory", "cc" + ); +} + +static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm__ volatile + ( + "1: \n" + "mov r4, %0 \n" + "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data + "vld1.u8 {d1}, [r4],%3 \n" + "vld1.u8 {d2}, [r4],%3 \n" + "vld1.u8 {d3}, [r4] \n" + + // data is loaded up int q0 and q1 + // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13 + // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a20 a21 a22 a23 b20 b21 b22 b23 + // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13 + "vpaddl.u8 q0, q0 \n" + + // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23 + // d1 = a10+a11+a20+a21 a12+a13+a22+a23 b10+b11+b20+b21 b12+b13+b22+b23 + "vpadal.u8 q0, q1 \n" + + // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23 + // d1 = a10+a11+a20+a21+a12+a13+a22+a23 b10+b11+b20+b21+b12+b13+b22+b23 + "vpaddl.u16 q0, q0 \n" + + + // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a20+a21+a12+a13+a22+a23 + // b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b20+b21+b12+b13+b22+b23 + "vadd.u32 d0, d1 \n" + + "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding + + "vst1.u8 {d0[0]}, [%1]! 
\n" + "vst1.u8 {d0[4]}, [%1]! \n" + + "add %0, #8 \n" // move src pointer to next 8 pixels + "subs %2, #2 \n" // dst_width -= 2 + "bhi 1b \n" + + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "q0", "q1", "memory", "cc" + ); +} + +/** + * SSE2 downscalers with interpolation. + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \ + !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif defined(OSX) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +// Offsets for source bytes 0 to 9 +extern "C" TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +extern "C" TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +extern "C" TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +extern "C" TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +extern "C" TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +extern "C" TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +extern "C" TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +extern "C" TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +extern "C" TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +extern "C" TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +extern "C" TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +extern "C" TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +extern "C" TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +extern "C" TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +extern "C" TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +extern "C" TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +extern "C" TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 
+extern "C" TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +extern "C" TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x000000ff + psrld xmm7, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
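+// Per output pixel this is the rounded mean of a 4x4 input block; the scalar
+// reference (see ScaleRowDown4Int_C later in this file) computes
+//   dst[x] = (sum of the 16 bytes in the block + 8) >> 4;
+// The SSE2 version below approximates that with an averaging tree: rows are
+// blended pairwise with pavgb, then columns with pavgw, rounding at each
+// step instead of summing once and dividing at the end.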
+__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // src_stride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes + psrlq xmm7, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm7 + pand xmm1, xmm7 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
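+// The 8 source rows are first collapsed to one with a pavgb tree; psadbw
+// against zero then sums each run of 8 horizontal bytes into a word (the
+// cheapest horizontal byte-sum SSE2 offers), and psrlw 3 divides by 8 to
+// complete the 8x8 box average.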
+__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + 16] + lea esi, [esi + 32] + movdqa xmm1, xmm2 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
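+// Arithmetic of the 3/4 filter: pmaddubsw with the madd tables produces
+// 3*a + b, 2*a + 2*b or a + 3*b for the three output phases, paddsw adds the
+// round34 bias of 2, and psrlw 2 divides by 4. The _1_ and _0_ variants
+// differ only in the row pre-blend: a single pavgb gives a 1/2,1/2 mix of
+// the two source rows, while the double pavgb in the _0_ variant gives
+// roughly a 3/4,1/4 mix.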
+__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm5, _shuf38a + movdqa xmm6, _shuf38b + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm6 + paddusb xmm0, xmm1 + + movq qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const 
uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. +__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + mov ebx, [esp + 32 + 20] // height + pxor xmm7, xmm7 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 
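+// Per column, with f = source_y_fraction in [0..255], the filter computes
+//   dst[x] = (src[x] * (256 - f) + src[x + src_stride] * f) >> 8;
+// (the SSSE3 version further below gets the same result from 7-bit weights
+// and pmaddubsw). f == 0 short-circuits to a straight copy and f == 128 to
+// pavgb. The trailing "mov al, [edi - 1] / mov [edi], al" replicates the
+// last output pixel one byte past the row, apparently for the benefit of a
+// column filter that reads one pixel beyond the end.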
+#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. +#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm7, eax + punpcklwd xmm7, xmm7 + pshufd xmm7, xmm7, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
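+// ScaleFilterCols34 is the single-row form of the 3/4 filter above: it
+// reuses the same shuf/madd/round34 tables but skips the vertical pavgb
+// blend, filtering 32 source pixels down to 24 within one row.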
+__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory" +); +} + +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%3,1),%%xmm2\n" + "movdqa 0x10(%0,%3,1),%%xmm3\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "psrlw $0x8,%%xmm0\n" + "movdqa %%xmm1,%%xmm3\n" + "psrlw $0x8,%%xmm1\n" + "pand %%xmm7,%%xmm2\n" + "pand %%xmm7,%%xmm3\n" + "pavgw %%xmm2,%%xmm0\n" + "pavgw %%xmm3,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory" +); +} + +#define HAS_SCALEROWDOWN4_SSE2 +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrld $0x18,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory" +); +} + +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t temp = 0; + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlw $0x8,%%xmm7\n" + "lea (%4,%4,2),%3\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%4,1),%%xmm2\n" + "movdqa 
0x10(%0,%4,1),%%xmm3\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "movdqa (%0,%4,2),%%xmm2\n" + "movdqa 0x10(%0,%4,2),%%xmm3\n" + "movdqa (%0,%3,1),%%xmm4\n" + "movdqa 0x10(%0,%3,1),%%xmm5\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm4,%%xmm2\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm5,%%xmm3\n" + "pavgb %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "psrlw $0x8,%%xmm0\n" + "movdqa %%xmm1,%%xmm3\n" + "psrlw $0x8,%%xmm1\n" + "pand %%xmm7,%%xmm2\n" + "pand %%xmm7,%%xmm3\n" + "pavgw %%xmm2,%%xmm0\n" + "pavgw %%xmm3,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,%%xmm2\n" + "psrlw $0x8,%%xmm0\n" + "pand %%xmm7,%%xmm2\n" + "pavgw %%xmm2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "lea 0x8(%1),%1\n" + "sub $0x8,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(temp) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory" +); +} + +#define HAS_SCALEROWDOWN8_SSE2 +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" + "psrlq $0x38,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pand %%xmm7,%%xmm0\n" + "pand %%xmm7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movd %%xmm0,(%1)\n" + "lea 0x4(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory" +); +} + +#if defined(__i386__) +extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown8Int_SSE2\n" +"_ScaleRowDown8Int_SSE2:\n" +#else + ".global ScaleRowDown8Int_SSE2\n" +"ScaleRowDown8Int_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "lea (%ebx,%ebx,2),%edx\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "movdqa (%esi,%ebx,1),%xmm2\n" + "movdqa 0x10(%esi,%ebx,1),%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa (%esi,%ebx,2),%xmm2\n" + "movdqa 0x10(%esi,%ebx,2),%xmm3\n" + "movdqa (%esi,%edx,1),%xmm4\n" + "movdqa 0x10(%esi,%edx,1),%xmm5\n" + "lea (%esi,%ebx,4),%ebp\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "movdqa 0x0(%ebp),%xmm2\n" + "movdqa 0x10(%ebp),%xmm3\n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4\n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4\n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5\n" + "movdqa 0x0(%ebp,%edx,1),%xmm6\n" + "pavgb %xmm6,%xmm4\n" + "movdqa 0x10(%ebp,%edx,1),%xmm6\n" + "pavgb %xmm6,%xmm5\n" + "pavgb %xmm4,%xmm2\n" + "pavgb %xmm5,%xmm3\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "psadbw %xmm7,%xmm0\n" + "psadbw %xmm7,%xmm1\n" + "pshufd $0xd8,%xmm0,%xmm0\n" + "pshufd $0x8d,%xmm1,%xmm1\n" + "por %xmm1,%xmm0\n" + "psrlw $0x3,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movd %xmm0,(%edi)\n" + "lea 0x4(%edi),%edi\n" + "sub $0x4,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_SSSE3\n" 
+"_ScaleRowDown34_SSSE3:\n" +#else + ".global ScaleRowDown34_SSSE3\n" +"ScaleRowDown34_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf0,%xmm3\n" + "movdqa _shuf1,%xmm4\n" + "movdqa _shuf2,%xmm5\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm2\n" + "lea 0x20(%esi),%esi\n" + "movdqa %xmm2,%xmm1\n" + "palignr $0x8,%xmm0,%xmm1\n" + "pshufb %xmm3,%xmm0\n" + "pshufb %xmm4,%xmm1\n" + "pshufb %xmm5,%xmm2\n" + "movq %xmm0,(%edi)\n" + "movq %xmm1,0x8(%edi)\n" + "movq %xmm2,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_1_Int_SSSE3\n" +"_ScaleRowDown34_1_Int_SSSE3:\n" +#else + ".global ScaleRowDown34_1_Int_SSSE3\n" +"ScaleRowDown34_1_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebp\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf01,%xmm2\n" + "movdqa _shuf11,%xmm3\n" + "movdqa _shuf21,%xmm4\n" + "movdqa _madd01,%xmm5\n" + "movdqa _madd11,%xmm6\n" + "movdqa _round34,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%ebp),%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm2,%xmm0\n" + "pmaddubsw %xmm5,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movdqu 0x8(%esi),%xmm0\n" + "movdqu 0x8(%esi,%ebp),%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm3,%xmm0\n" + "pmaddubsw %xmm6,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x8(%edi)\n" + "movdqa 0x10(%esi),%xmm0\n" + "movdqa 0x10(%esi,%ebp),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa _madd21,%xmm1\n" + "pmaddubsw %xmm1,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown34_0_Int_SSSE3\n" +"_ScaleRowDown34_0_Int_SSSE3:\n" +#else + ".global ScaleRowDown34_0_Int_SSSE3\n" +"ScaleRowDown34_0_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%ebp\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf01,%xmm2\n" + "movdqa _shuf11,%xmm3\n" + "movdqa _shuf21,%xmm4\n" + "movdqa _madd01,%xmm5\n" + "movdqa _madd11,%xmm6\n" + "movdqa _round34,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%ebp,1),%xmm1\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm2,%xmm0\n" + "pmaddubsw %xmm5,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movdqu 0x8(%esi),%xmm0\n" + "movdqu 0x8(%esi,%ebp,1),%xmm1\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm3,%xmm0\n" + "pmaddubsw %xmm6,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,0x8(%edi)\n" + "movdqa 0x10(%esi),%xmm0\n" + "movdqa 0x10(%esi,%ebp,1),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pavgb %xmm0,%xmm1\n" + "pavgb %xmm1,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa _madd21,%xmm1\n" + "pmaddubsw %xmm1,%xmm0\n" + "paddsw %xmm7,%xmm0\n" + "psrlw $0x2,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + 
"movq %xmm0,0x10(%edi)\n" + "lea 0x18(%edi),%edi\n" + "sub $0x18,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_SSSE3\n" +"_ScaleRowDown38_SSSE3:\n" +#else + ".global ScaleRowDown38_SSSE3\n" +"ScaleRowDown38_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shuf38a ,%xmm5\n" + "movdqa _shuf38b ,%xmm6\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa 0x10(%esi),%xmm1\n" + "lea 0x20(%esi),%esi\n" + "pshufb %xmm5,%xmm0\n" + "pshufb %xmm6,%xmm1\n" + "paddusb %xmm1,%xmm0\n" + "movq %xmm0,(%edi)\n" + "movhlps %xmm0,%xmm1\n" + "movd %xmm1,0x8(%edi)\n" + "lea 0xc(%edi),%edi\n" + "sub $0xc,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_3_Int_SSSE3\n" +"_ScaleRowDown38_3_Int_SSSE3:\n" +#else + ".global ScaleRowDown38_3_Int_SSSE3\n" +"ScaleRowDown38_3_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shufac0,%xmm4\n" + "movdqa _shufac3,%xmm5\n" + "movdqa _scaleac3,%xmm6\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "movhlps %xmm0,%xmm1\n" + "movhlps %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm1\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "movdqa (%esi,%edx,2),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movhlps %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "movdqa %xmm0,%xmm2\n" + "psrldq $0x2,%xmm0\n" + "paddusw %xmm0,%xmm2\n" + "psrldq $0x2,%xmm0\n" + "paddusw %xmm0,%xmm2\n" + "pshufb %xmm4,%xmm2\n" + "movdqa %xmm1,%xmm3\n" + "psrldq $0x2,%xmm1\n" + "paddusw %xmm1,%xmm3\n" + "psrldq $0x2,%xmm1\n" + "paddusw %xmm1,%xmm3\n" + "pshufb %xmm5,%xmm3\n" + "paddusw %xmm3,%xmm2\n" + "pmulhuw %xmm6,%xmm2\n" + "packuswb %xmm2,%xmm2\n" + "movd %xmm2,(%edi)\n" + "pextrw $0x2,%xmm2,%eax\n" + "mov %ax,0x4(%edi)\n" + "lea 0x6(%edi),%edi\n" + "sub $0x6,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleRowDown38_2_Int_SSSE3\n" +"_ScaleRowDown38_2_Int_SSSE3:\n" +#else + ".global ScaleRowDown38_2_Int_SSSE3\n" +"ScaleRowDown38_2_Int_SSSE3:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "movdqa _shufab0,%xmm4\n" + "movdqa _shufab1,%xmm5\n" + "movdqa _shufab2,%xmm6\n" + "movdqa _scaleab2,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm2\n" + "pavgb (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm2,%xmm0\n" + "pshufb %xmm4,%xmm0\n" + "movdqa %xmm2,%xmm1\n" + "pshufb %xmm5,%xmm1\n" + "paddusw %xmm1,%xmm0\n" + "pshufb %xmm6,%xmm2\n" + "paddusw %xmm2,%xmm0\n" + "pmulhuw %xmm7,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movd %xmm0,(%edi)\n" + "pextrw $0x2,%xmm0,%eax\n" + "mov %ax,0x4(%edi)\n" + "lea 0x6(%edi),%edi\n" + "sub $0x6,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); +#endif // 
__PIC__ + +#define HAS_SCALEADDROWS_SSE2 +extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleAddRows_SSE2\n" +"_ScaleAddRows_SSE2:\n" +#else + ".global ScaleAddRows_SSE2\n" +"ScaleAddRows_SSE2:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%esi\n" + "mov 0x28(%esp),%edx\n" + "mov 0x2c(%esp),%edi\n" + "mov 0x30(%esp),%ecx\n" + "mov 0x34(%esp),%ebx\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm2\n" + "lea (%esi,%edx,1),%eax\n" + "movhlps %xmm2,%xmm3\n" + "lea -0x1(%ebx),%ebp\n" + "punpcklbw %xmm7,%xmm2\n" + "punpcklbw %xmm7,%xmm3\n" + +"2:" + "movdqa (%eax),%xmm0\n" + "lea (%eax,%edx,1),%eax\n" + "movhlps %xmm0,%xmm1\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm1\n" + "paddusw %xmm0,%xmm2\n" + "paddusw %xmm1,%xmm3\n" + "sub $0x1,%ebp\n" + "ja 2b\n" + + "movdqa %xmm2,(%edi)\n" + "movdqa %xmm3,0x10(%edi)\n" + "lea 0x20(%edi),%edi\n" + "lea 0x10(%esi),%esi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSE2\n" +"_ScaleFilterRows_SSE2:\n" +#else + ".global ScaleFilterRows_SSE2\n" +"ScaleFilterRows_SSE2:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%edi\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%ecx\n" + "mov 0x1c(%esp),%eax\n" + "cmp $0x0,%eax\n" + "je 2f\n" + "cmp $0x80,%eax\n" + "je 3f\n" + "movd %eax,%xmm6\n" + "punpcklwd %xmm6,%xmm6\n" + "pshufd $0x0,%xmm6,%xmm6\n" + "neg %eax\n" + "add $0x100,%eax\n" + "movd %eax,%xmm5\n" + "punpcklwd %xmm5,%xmm5\n" + "pshufd $0x0,%xmm5,%xmm5\n" + "pxor %xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,%xmm1\n" + "movdqa %xmm2,%xmm3\n" + "punpcklbw %xmm7,%xmm0\n" + "punpcklbw %xmm7,%xmm2\n" + "punpckhbw %xmm7,%xmm1\n" + "punpckhbw %xmm7,%xmm3\n" + "pmullw %xmm5,%xmm0\n" + "pmullw %xmm5,%xmm1\n" + "pmullw %xmm6,%xmm2\n" + "pmullw %xmm6,%xmm3\n" + "paddusw %xmm2,%xmm0\n" + "paddusw %xmm3,%xmm1\n" + "psrlw $0x8,%xmm0\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"2:" + "movdqa (%esi),%xmm0\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 2b\n" + + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"3:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "pavgb %xmm2,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 3b\n" + + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text\n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSSE3\n" +"_ScaleFilterRows_SSSE3:\n" +#else + ".global ScaleFilterRows_SSSE3\n" +"ScaleFilterRows_SSSE3:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%edi\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%ecx\n" + "mov 0x1c(%esp),%eax\n" + "cmp $0x0,%eax\n" + "je 2f\n" + "cmp $0x80,%eax\n" + "je 3f\n" + "shr %eax\n" + "mov %al,%ah\n" + "neg %al\n" + "add $0x80,%al\n" + "movd %eax,%xmm7\n" + "punpcklwd %xmm7,%xmm7\n" + "pshufd $0x0,%xmm7,%xmm7\n" + +"1:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,%xmm1\n" + "punpcklbw %xmm2,%xmm0\n" + "punpckhbw %xmm2,%xmm1\n" + "pmaddubsw %xmm7,%xmm0\n" + "pmaddubsw %xmm7,%xmm1\n" + "psrlw $0x7,%xmm0\n" + "psrlw $0x7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"2:" + "movdqa (%esi),%xmm0\n" + "lea 0x10(%esi),%esi\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 2b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" + +"3:" + "movdqa (%esi),%xmm0\n" + "movdqa (%esi,%edx,1),%xmm2\n" + "lea 0x10(%esi),%esi\n" + "pavgb %xmm2,%xmm0\n" + "movdqa %xmm0,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 3b\n" + "mov -0x1(%edi),%al\n" + "mov %al,(%edi)\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); + +#elif defined(__x86_64__) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "lea (%3,%3,2),%%r10\n" + "pxor %%xmm7,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa (%0,%3,1),%%xmm2\n" + "movdqa 0x10(%0,%3,1),%%xmm3\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "movdqa (%0,%3,2),%%xmm2\n" + "movdqa 0x10(%0,%3,2),%%xmm3\n" + "movdqa (%0,%%r10,1),%%xmm4\n" + "movdqa 0x10(%0,%%r10,1),%%xmm5\n" + "lea (%0,%3,4),%%r11\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm4,%%xmm2\n" + "pavgb %%xmm5,%%xmm3\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "movdqa 0x0(%%r11),%%xmm2\n" + "movdqa 0x10(%%r11),%%xmm3\n" + "movdqa 0x0(%%r11,%3,1),%%xmm4\n" + "movdqa 0x10(%%r11,%3,1),%%xmm5\n" + "pavgb %%xmm4,%%xmm2\n" + "pavgb %%xmm5,%%xmm3\n" + "movdqa 0x0(%%r11,%3,2),%%xmm4\n" + "movdqa 0x10(%%r11,%3,2),%%xmm5\n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n" + "pavgb %%xmm6,%%xmm4\n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n" + "pavgb %%xmm6,%%xmm5\n" + "pavgb %%xmm4,%%xmm2\n" + "pavgb %%xmm5,%%xmm3\n" + "pavgb %%xmm2,%%xmm0\n" + "pavgb %%xmm3,%%xmm1\n" + "psadbw %%xmm7,%%xmm0\n" + "psadbw %%xmm7,%%xmm1\n" + "pshufd $0xd8,%%xmm0,%%xmm0\n" + "pshufd $0x8d,%%xmm1,%%xmm1\n" + "por %%xmm1,%%xmm0\n" + "psrlw $0x3,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movd %%xmm0,(%1)\n" + "lea 0x4(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7" +); +} + +#define HAS_SCALEROWDOWN34_SSSE3 +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int 
dst_width) { + asm volatile( + "movdqa (%3),%%xmm3\n" + "movdqa (%4),%%xmm4\n" + "movdqa (%5),%%xmm5\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm2\n" + "lea 0x20(%0),%0\n" + "movdqa %%xmm2,%%xmm1\n" + "palignr $0x8,%%xmm0,%%xmm1\n" + "pshufb %%xmm3,%%xmm0\n" + "pshufb %%xmm4,%%xmm1\n" + "pshufb %%xmm5,%%xmm2\n" + "movq %%xmm0,(%1)\n" + "movq %%xmm1,0x8(%1)\n" + "movq %%xmm2,0x10(%1)\n" + "lea 0x18(%1),%1\n" + "sub $0x18,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf0), // %3 + "r"(_shuf1), // %4 + "r"(_shuf2) // %5 + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +); +} + +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm2\n" // _shuf01 + "movdqa (%5),%%xmm3\n" // _shuf11 + "movdqa (%6),%%xmm4\n" // _shuf21 + "movdqa (%7),%%xmm5\n" // _madd01 + "movdqa (%8),%%xmm6\n" // _madd11 + "movdqa (%9),%%xmm7\n" // _round34 + "movdqa (%10),%%xmm8\n" // _madd21 +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa (%0,%3),%%xmm1\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm2,%%xmm0\n" + "pmaddubsw %%xmm5,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movdqu 0x8(%0),%%xmm0\n" + "movdqu 0x8(%0,%3),%%xmm1\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm3,%%xmm0\n" + "pmaddubsw %%xmm6,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,0x8(%1)\n" + "movdqa 0x10(%0),%%xmm0\n" + "movdqa 0x10(%0,%3),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm4,%%xmm0\n" + "pmaddubsw %%xmm8,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,0x10(%1)\n" + "lea 0x18(%1),%1\n" + "sub $0x18,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" +); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm2\n" // _shuf01 + "movdqa (%5),%%xmm3\n" // _shuf11 + "movdqa (%6),%%xmm4\n" // _shuf21 + "movdqa (%7),%%xmm5\n" // _madd01 + "movdqa (%8),%%xmm6\n" // _madd11 + "movdqa (%9),%%xmm7\n" // _round34 + "movdqa (%10),%%xmm8\n" // _madd21 +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa (%0,%3,1),%%xmm1\n" + "pavgb %%xmm0,%%xmm1\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm2,%%xmm0\n" + "pmaddubsw %%xmm5,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movdqu 0x8(%0),%%xmm0\n" + "movdqu 0x8(%0,%3,1),%%xmm1\n" + "pavgb %%xmm0,%%xmm1\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm3,%%xmm0\n" + "pmaddubsw %%xmm6,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,0x8(%1)\n" + "movdqa 0x10(%0),%%xmm0\n" + "movdqa 0x10(%0,%3,1),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pavgb %%xmm0,%%xmm1\n" + "pavgb %%xmm1,%%xmm0\n" + "pshufb %%xmm4,%%xmm0\n" + "pmaddubsw %%xmm8,%%xmm0\n" + "paddsw %%xmm7,%%xmm0\n" + "psrlw $0x2,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movq %%xmm0,0x10(%1)\n" + "lea 0x18(%1),%1\n" + "sub $0x18,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + 
"+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" +); +} + +#define HAS_SCALEROWDOWN38_SSSE3 +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%3),%%xmm5\n" + "movdqa (%4),%%xmm6\n" + "pxor %%xmm7,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "lea 0x20(%0),%0\n" + "pshufb %%xmm5,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "paddusb %%xmm1,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movhlps %%xmm0,%%xmm1\n" + "movd %%xmm1,0x8(%1)\n" + "lea 0xc(%1),%1\n" + "sub $0xc,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf38a), // %3 + "r"(_shuf38b) // %4 + : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm4\n" + "movdqa (%5),%%xmm5\n" + "movdqa (%6),%%xmm6\n" + "pxor %%xmm7,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa (%0,%3,1),%%xmm2\n" + "movhlps %%xmm0,%%xmm1\n" + "movhlps %%xmm2,%%xmm3\n" + "punpcklbw %%xmm7,%%xmm0\n" + "punpcklbw %%xmm7,%%xmm1\n" + "punpcklbw %%xmm7,%%xmm2\n" + "punpcklbw %%xmm7,%%xmm3\n" + "paddusw %%xmm2,%%xmm0\n" + "paddusw %%xmm3,%%xmm1\n" + "movdqa (%0,%3,2),%%xmm2\n" + "lea 0x10(%0),%0\n" + "movhlps %%xmm2,%%xmm3\n" + "punpcklbw %%xmm7,%%xmm2\n" + "punpcklbw %%xmm7,%%xmm3\n" + "paddusw %%xmm2,%%xmm0\n" + "paddusw %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "psrldq $0x2,%%xmm0\n" + "paddusw %%xmm0,%%xmm2\n" + "psrldq $0x2,%%xmm0\n" + "paddusw %%xmm0,%%xmm2\n" + "pshufb %%xmm4,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "psrldq $0x2,%%xmm1\n" + "paddusw %%xmm1,%%xmm3\n" + "psrldq $0x2,%%xmm1\n" + "paddusw %%xmm1,%%xmm3\n" + "pshufb %%xmm5,%%xmm3\n" + "paddusw %%xmm3,%%xmm2\n" + "pmulhuw %%xmm6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movd %%xmm2,(%1)\n" + "pextrw $0x2,%%xmm2,%%eax\n" + "mov %%ax,0x4(%1)\n" + "lea 0x6(%1),%1\n" + "sub $0x6,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(_shufac0), // %4 + "r"(_shufac3), // %5 + "r"(_scaleac3) // %6 + : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm4\n" + "movdqa (%5),%%xmm5\n" + "movdqa (%6),%%xmm6\n" + "movdqa (%7),%%xmm7\n" +"1:" + "movdqa (%0),%%xmm2\n" + "pavgb (%0,%3,1),%%xmm2\n" + "lea 0x10(%0),%0\n" + "movdqa %%xmm2,%%xmm0\n" + "pshufb %%xmm4,%%xmm0\n" + "movdqa %%xmm2,%%xmm1\n" + "pshufb %%xmm5,%%xmm1\n" + "paddusw %%xmm1,%%xmm0\n" + "pshufb %%xmm6,%%xmm2\n" + "paddusw %%xmm2,%%xmm0\n" + "pmulhuw %%xmm7,%%xmm0\n" + "packuswb %%xmm0,%%xmm0\n" + "movd %%xmm0,(%1)\n" + "pextrw $0x2,%%xmm0,%%eax\n" + "mov %%ax,0x4(%1)\n" + "lea 0x6(%1),%1\n" + "sub $0x6,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "r"(_shufab0), // %4 + "r"(_shufab1), // %5 + "r"(_shufab2), // %6 + "r"(_scaleab2) // %7 + : "memory", "rax", "xmm0", "xmm1", "xmm2", + "xmm4", "xmm5", 
"xmm6", "xmm7" +); +} + +#define HAS_SCALEADDROWS_SSE2 +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + asm volatile( + "pxor %%xmm7,%%xmm7\n" +"1:" + "movdqa (%0),%%xmm2\n" + "lea (%0,%4,1),%%r10\n" + "movhlps %%xmm2,%%xmm3\n" + "lea -0x1(%3),%%r11\n" + "punpcklbw %%xmm7,%%xmm2\n" + "punpcklbw %%xmm7,%%xmm3\n" + +"2:" + "movdqa (%%r10),%%xmm0\n" + "lea (%%r10,%4,1),%%r10\n" + "movhlps %%xmm0,%%xmm1\n" + "punpcklbw %%xmm7,%%xmm0\n" + "punpcklbw %%xmm7,%%xmm1\n" + "paddusw %%xmm0,%%xmm2\n" + "paddusw %%xmm1,%%xmm3\n" + "sub $0x1,%%r11\n" + "ja 2b\n" + + "movdqa %%xmm2,(%1)\n" + "movdqa %%xmm3,0x10(%1)\n" + "lea 0x20(%1),%1\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width), // %2 + "+r"(src_height) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7" +); +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +static void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0\n" + "lea 0x10(%1),%1\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "rax", "xmm0" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0\n" + "movdqa (%1,%3,1),%%xmm2\n" + "lea 0x10(%1),%1\n" + "pavgb %%xmm2,%%xmm0\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "rax", "xmm0", "xmm2" + ); + return; + } else { + asm volatile( + "mov %3,%%eax\n" + "movd %%eax,%%xmm6\n" + "punpcklwd %%xmm6,%%xmm6\n" + "pshufd $0x0,%%xmm6,%%xmm6\n" + "neg %%eax\n" + "add $0x100,%%eax\n" + "movd %%eax,%%xmm5\n" + "punpcklwd %%xmm5,%%xmm5\n" + "pshufd $0x0,%%xmm5,%%xmm5\n" + "pxor %%xmm7,%%xmm7\n" + "1:" + "movdqa (%1),%%xmm0\n" + "movdqa (%1,%4,1),%%xmm2\n" + "lea 0x10(%1),%1\n" + "movdqa %%xmm0,%%xmm1\n" + "movdqa %%xmm2,%%xmm3\n" + "punpcklbw %%xmm7,%%xmm0\n" + "punpcklbw %%xmm7,%%xmm2\n" + "punpckhbw %%xmm7,%%xmm1\n" + "punpckhbw %%xmm7,%%xmm3\n" + "pmullw %%xmm5,%%xmm0\n" + "pmullw %%xmm5,%%xmm1\n" + "pmullw %%xmm6,%%xmm2\n" + "pmullw %%xmm6,%%xmm3\n" + "paddusw %%xmm2,%%xmm0\n" + "paddusw %%xmm3,%%xmm1\n" + "psrlw $0x8,%%xmm0\n" + "psrlw $0x8,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", + "xmm5", "xmm6", "xmm7" + ); + } + return; +} + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0\n" + "lea 0x10(%1),%1\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "rax", "xmm0" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0\n" + "movdqa (%1,%3,1),%%xmm2\n" + "lea 0x10(%1),%1\n" + "pavgb %%xmm2,%%xmm0\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "rax", "xmm0", "xmm2" + ); + return; + } else { + asm volatile( + "mov %3,%%eax\n" + "shr %%eax\n" + "mov %%al,%%ah\n" + "neg %%al\n" + "add $0x80,%%al\n" + "movd %%eax,%%xmm7\n" + "punpcklwd %%xmm7,%%xmm7\n" + "pshufd $0x0,%%xmm7,%%xmm7\n" + "1:" + "movdqa (%1),%%xmm0\n" + "movdqa (%1,%4,1),%%xmm2\n" + "lea 0x10(%1),%1\n" + "movdqa %%xmm0,%%xmm1\n" + "punpcklbw %%xmm2,%%xmm0\n" + "punpckhbw %%xmm2,%%xmm1\n" + "pmaddubsw %%xmm7,%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm1\n" + "psrlw $0x7,%%xmm0\n" + "psrlw $0x7,%%xmm1\n" + "packuswb %%xmm1,%%xmm0\n" + "movdqa %%xmm0,(%0)\n" + "lea 0x10(%0),%0\n" + "sub $0x10,%2\n" + "ja 1b\n" + "mov -0x1(%0),%%al\n" + "mov %%al,(%0)\n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7" + ); + } + return; +} +#endif +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + + 8) >> 4; + src_ptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. 
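+// (ScaleRowDown8Int_C's temporary row below is kMaxRow12 * 2 = 2560 bytes.)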
+static const int kMaxOutputWidth = 640; +static const int kMaxRow12 = kMaxOutputWidth * 2; + +static void ScaleRowDown8_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + ALIGN16(uint8 src_row[kMaxRow12 * 2]); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, + src_row + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); +} + +static void ScaleRowDown34_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = d + dst_width; + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = d + dst_width; + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = dst_ptr + dst_width; + const uint8* s = src_ptr; + do { + dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + dst_ptr += 3; + s += 4; + } while (dst_ptr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx) { + int x = 0; + for (int j = 0; j < dst_width; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +static const int kMaxInputWidth = 2560; +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + 
uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + ALIGN16(uint8 row[kMaxInputWidth]); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, + 256 / 4); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + ALIGN16(uint8 row[kMaxInputWidth]); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} +#endif + +static void ScaleRowDown38_C(const uint8* src_ptr, int, + uint8* dst, int dst_width) { + assert(dst_width % 3 == 0); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + assert(dst_width > 0); + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + uint8* end = dst_ptr + dst_width; + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr 
< end); + dst_ptr[0] = dst_ptr[-1]; +} + +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height) { + assert(src_width > 0); + assert(src_height > 0); + for (int x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + int sum = 0; + for (int y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + dst_ptr[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(src_width % 2 == 0); + assert(src_height % 2 == 0); + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (dst_width % 16 == 0) && (src_stride % 16 == 0) && + (dst_stride % 16 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(dst_ptr, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(src_width % 4 == 0); + assert(src_height % 4 == 0); + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + +#if defined(HAS_SCALEROWDOWN4_NEON) + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + (dst_width % 2 == 0) && (src_stride % 8 == 0) && + IS_ALIGNED(src_ptr, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (dst_width % 8 == 0) && (src_stride % 16 == 0) && + (dst_stride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 2); + dst_ptr += dst_stride; + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. 
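+ * With filtering, each output pixel averages a full 8x8 block of source + * pixels: ScaleRowDown8Int_C makes two 4x4 averaging passes and then a + * final 2x2 pass.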
+ * + */ +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(src_width % 8 == 0); + assert(src_height % 8 == 0); + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth && + (src_stride % 16 == 0) && (dst_stride % 16 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; + } + for (int y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 3); + dst_ptr += dst_stride; + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(dst_width % 3 == 0); + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (dst_width % 24 == 0) && (src_stride % 16 == 0) && + (dst_stride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (dst_width % 24 == 0) && (src_stride % 16 == 0) && + (dst_stride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && + filtering) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + int src_row = 0; + for (int y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 1: + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 2: + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + break; + } + ++src_row; + src_ptr += src_stride; + dst_ptr += dst_stride; + if (src_row >= 3) { + src_ptr += src_stride; + src_row = 0; + } + } +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
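+ * The filtered path consumes source rows in groups of eight: two + * 8x3 -> 3x1 passes followed by one 8x2 -> 3x1 pass.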
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(dst_width % 3 == 0); + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (dst_width % 24 == 0) && (src_stride % 16 == 0) && + (dst_stride % 8 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + int src_row = 0; + for (int y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + case 1: + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + ++src_row; + break; + + case 2: + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + src_row = 0; + break; + } + dst_ptr += dst_stride; + } +} + +inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { + assert(iboxwidth > 0); + assert(iboxheight > 0); + uint32 sum = 0u; + for (int y = 0; y < iboxheight; ++y) { + for (int x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int x = 0; + for (int i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + int boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + assert(iboxwidth > 0); + uint32 sum = 0u; + for (int x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + for (int i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + int boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + for (int i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +/** + * Scale plane down to any dimensions, with interpolation. + * (boxfilter). + * + * Same method as SimpleScale, which is fixed point, outputting + * one pixel of destination using fixed point (16.16) to step + * through source, sampling a box of pixel with simple + * averaging. 
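+ * + * For example, a 4:3 horizontal shrink gives dx = (src_width << 16) / + * dst_width, roughly 1.33 in 16.16 fixed point, so successive boxes span + * one or two source columns (four source pixels for every three outputs).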
+ */ +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + assert(dst_width > 0); + assert(dst_height > 0); + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) || + dst_height * 2 > src_height) { + uint8* dst = dst_ptr; + int y = 0; + for (int j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + int boxheight = (y >> 16) - iy; + ScalePlaneBoxRow(dst_width, boxheight, + dx, src_stride, + src, dst); + + dst += dst_stride; + } + } else { + ALIGN16(uint16 row[kMaxInputWidth]); + void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height); + void (*ScaleAddCols)(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr); +#if defined(HAS_SCALEADDROWS_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && + (src_width % 16) == 0) { + ScaleAddRows = ScaleAddRows_SSE2; + } else +#endif + { + ScaleAddRows = ScaleAddRows_C; + } + if (dx & 0xffff) { + ScaleAddCols = ScaleAddCols2_C; + } else { + ScaleAddCols = ScaleAddCols1_C; + } + + int y = 0; + for (int j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + int boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, row, src_width, boxheight); + ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane to/from any dimensions, with interpolation. + */ +static void ScalePlaneBilinearSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int maxx = ((src_width - 1) << 16) - 1; + int maxy = ((src_height - 1) << 16) - 1; + int y = (dst_height < src_height) ? 32768 : + (src_height << 16) / dst_height - 32768; + for (int i = 0; i < dst_height; ++i) { + int cy = (y < 0) ? 0 : y; + int yi = cy >> 16; + int yf = cy & 0xffff; + const uint8* const src = src_ptr + yi * src_stride; + int x = (dst_width < src_width) ? 32768 : + (src_width << 16) / dst_width - 32768; + for (int j = 0; j < dst_width; ++j) { + int cx = (x < 0) ? 0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + src_stride] * (65536 - xf) + + src[xi + src_stride + 1] * xf) >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += dst_stride - dst_width; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. 
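+ * + * Each output row is built in two passes: ScaleFilterRows blends the two + * nearest source rows into a temporary row, and ScaleFilterCols then + * steps across that row in 16.16 fixed point to sample horizontally.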
+ */ +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + assert(dst_width > 0); + assert(dst_height > 0); + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && + (src_width % 16) == 0) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && + (src_width % 16) == 0) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + int y = 0; + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + for (int j = 0; j < dst_height; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(dst_ptr, row, dst_width, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + for (int y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * + src_stride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + for (int i = 0; i < dst_width; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += dst_stride - dst_width; + } +} + +/** + * Scale plane to/from any dimensions. + */ +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. 
XGA->LowResPAL + * + */ +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Copy plane, no scaling + * + * This simply copies the given plane without scaling. + * The current implementation is ~115 times faster + * compared to the reference implementation. + * + */ +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + if (src_stride == src_width && dst_stride == dst_width) { + // All contiguous, so can use REALLY fast path. + memcpy(dst_ptr, src_ptr, src_width * src_height); + } else { + // Not all contiguous; must copy scanlines individually. + const uint8* src = src_ptr; + uint8* dst = dst_ptr; + for (int i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering, bool use_ref) { + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { + // Scale down. + if (use_ref) { + // For testing, allow the optimized versions to be disabled. + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + // 3/8 rounded up for odd sized chroma height. 
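+ // e.g. a 99 row chroma plane gives dst_height = (99 * 3 + 7) / 8 = 38, + // where truncating 99 * 3 / 8 would give 37.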
+ } else if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + // optimized, 1/8 + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else { + // Arbitrary downsample + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } + } else { + // Arbitrary scale up and/or down. + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } +} + +/** + * Scale a plane. + * + * This function in turn calls a scaling function + * suitable for handling the desired resolutions. + * + */ + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + int halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + return 0; +} + +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + bool interpolate) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. 
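+ // Point each plane at its last row and walk upward with negated + // strides; the chroma planes are half height, rounded up.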
+ if (src_height < 0) { + src_height = -src_height; + int halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + FilterMode filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + return 0; +} + +int Scale(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int ooffset, + bool interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || + ooffset >= dst_height) { + return -1; + } + ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + int aheight = dst_height - ooffset * 2; // actual output height + const uint8* const iyptr = src; + uint8* oyptr = dst + ooffset * dst_width; + const uint8* const iuptr = src + src_width * src_height; + uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; + const uint8* const ivptr = src + src_width * src_height + + halfsrc_width * halfsrc_height; + uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + + (ooffset >> 1) * halfdst_width; + return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, + src_width, src_height, oyptr, ouptr, ovptr, dst_width, + halfdst_width, halfdst_width, dst_width, aheight, interpolate); +} + +} // namespace libyuv diff --git a/files/source/video_common.cc b/files/source/video_common.cc new file mode 100644 index 00000000..8b8ee622 --- /dev/null +++ b/files/source/video_common.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "video_common.h" + +#include <sstream> + +namespace libyuv { + +#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0])))) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, + {FOURCC_BA81, FOURCC_BGGR}, + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. 
+ {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, +}; + +uint32 CanonicalFourCC(uint32 fourcc) { + for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +} // namespace libyuv diff --git a/files/source/video_common.h b/files/source/video_common.h new file mode 100644 index 00000000..9fe08a03 --- /dev/null +++ b/files/source/video_common.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* +* Common definitions for video, including fourcc and VideoFormat +*/ + + +#ifndef LIBYUV_SOURCE_VIDEO_COMMON_H_ +#define LIBYUV_SOURCE_VIDEO_COMMON_H_ + +#include <string> + +#include "libyuv/basic_types.h" + +namespace libyuv { + +////////////////////////////////////////////////////////////////////////////// +// Definition of fourcc. +////////////////////////////////////////////////////////////////////////////// +// Convert four characters to a fourcc code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#define FOURCC(a, b, c, d) (\ + (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ + (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) + +// Some good pages discussing FourCC codes: +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://www.fourcc.org/yuv.php +enum FourCC { + // Canonical fourcc codes used in our code. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + // Next four are Bayer RGB formats. The four characters define the order of + // the colours in each 2x2 pixel grid, going left-to-right and top-to-bottom. + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // Aliases for canonical fourcc codes, replaced with their canonical + // equivalents by CanonicalFourCC(). 
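+ // For example, CanonicalFourCC(FOURCC_IYUV) returns FOURCC_I420.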
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420 + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Alias for I420 + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2 + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG + + // Match any fourcc. + FOURCC_ANY = 0xFFFFFFFF, +}; + +// Converts fourcc aliases into canonical ones. +uint32 CanonicalFourCC(uint32 fourcc); + +} // namespace libyuv + +#endif // LIBYUV_SOURCE_VIDEO_COMMON_H_ diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc new file mode 100644 index 00000000..1c295b08 --- /dev/null +++ b/files/unit_test/rotate_test.cc @@ -0,0 +1,1519 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "../source/rotate_priv.h" +#include "unit_test.h" +#include <stdlib.h> +#include <time.h> + +using namespace libyuv; + +void print_array(uint8 *array, int w, int h) { + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) + printf("%4d", (signed char)array[(i * w) + j]); + + printf("\n"); + } +} + +TEST_F(libyuvTest, Transpose) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 8; iw < _rotate_max_w && !err; ++iw) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_1; + uint8 *output_2; + + ow = ih; + oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + TransposePlane(input, iw, output_1, ow, iw, ih); + TransposePlane(output_1, ow, output_2, oh, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_2[i]) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("transpose 1\n"); + print_array(output_1, ow, oh); + + printf("transpose 2\n"); + print_array(output_2, iw, ih); + } + + free(input); + free(output_1); + free(output_2); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, TransposeUV) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 2) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_a1, *output_b1; + uint8 *output_a2, *output_b2; + + ow = ih; + oh = iw >> 1; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_a1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_b1 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_a2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_b2 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + + for (i = 0; i < (iw * ih); i += 2) { + input[i] = i >> 1; + input[i + 1] = -(i >> 1); + } + + TransposeUV(input, iw, output_a1, 
ow, output_b1, ow, iw >> 1, ih); + + TransposePlane(output_a1, ow, output_a2, oh, ow, oh); + TransposePlane(output_b1, ow, output_b2, oh, ow, oh); + + for (i = 0; i < (iw * ih); i += 2) { + if (input[i] != output_a2[i >> 1]) + err++; + if (input[i + 1] != output_b2[i >> 1]) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("transpose 1\n"); + print_array(output_a1, ow, oh); + print_array(output_b1, ow, oh); + + printf("transpose 2\n"); + print_array(output_a2, oh, ow); + print_array(output_b2, oh, ow); + } + + free(input); + free(output_a1); + free(output_b1); + free(output_a2); + free(output_b2); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane90) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 8; iw < _rotate_max_w && !err; ++iw) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_90; + uint8 *output_180; + uint8 *output_270; + + ow = ih; + oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane90(input, iw, output_90, ow, iw, ih); + RotatePlane90(output_90, ow, output_180, oh, ow, oh); + RotatePlane90(output_180, oh, output_270, ow, oh, ow); + RotatePlane90(output_270, ow, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 90\n"); + print_array(output_90, ow, oh); + + printf("output 180\n"); + print_array(output_180, iw, ih); + + printf("output 270\n"); + print_array(output_270, ow, oh); + + printf("output 0\n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); + free(output_180); + free(output_270); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotateUV90) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 2) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0_u; + uint8 *output_0_v; + uint8 *output_90_u; + uint8 *output_90_v; + uint8 *output_180_u; + uint8 *output_180_v; + + ow = ih; + oh = iw >> 1; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); i += 2) { + input[i] = i >> 1; + input[i + 1] = -(i >> 1); + } + + RotateUV90(input, iw, output_90_u, ow, output_90_v, ow, iw >> 1, ih); + + RotatePlane90(output_90_u, ow, output_180_u, oh, ow, oh); + RotatePlane90(output_90_v, ow, output_180_v, oh, ow, oh); + + RotatePlane180(output_180_u, ow, output_0_u, ow, ow, oh); + RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh); + + for (i = 0; i < (ow * oh); ++i) { + if (output_0_u[i] != (uint8)i) + err++; + if (output_0_v[i] != (uint8)(-i)) + err++; + } + + if (err) { + 
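+ // Dump the input and every intermediate rotation to aid debugging.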
printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 90_u\n"); + print_array(output_90_u, ow, oh); + + printf("output 90_v\n"); + print_array(output_90_v, ow, oh); + + printf("output 180_u\n"); + print_array(output_180_u, oh, ow); + + printf("output 180_v\n"); + print_array(output_180_v, oh, ow); + + printf("output 0_u\n"); + print_array(output_0_u, oh, ow); + + printf("output 0_v\n"); + print_array(output_0_v, oh, ow); + } + + free(input); + free(output_0_u); + free(output_0_v); + free(output_90_u); + free(output_90_v); + free(output_180_u); + free(output_180_v); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotateUV180) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 2) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0_u; + uint8 *output_0_v; + uint8 *output_90_u; + uint8 *output_90_v; + uint8 *output_180_u; + uint8 *output_180_v; + + ow = iw >> 1; + oh = ih; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_90_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_90_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); i += 2) { + input[i] = i >> 1; + input[i + 1] = -(i >> 1); + } + + RotateUV180(input, iw, output_180_u, ow, output_180_v, ow, iw >> 1, ih); + + RotatePlane90(output_180_u, ow, output_90_u, oh, ow, oh); + RotatePlane90(output_180_v, ow, output_90_v, oh, ow, oh); + + RotatePlane90(output_90_u, oh, output_0_u, ow, oh, ow); + RotatePlane90(output_90_v, oh, output_0_v, ow, oh, ow); + + for (i = 0; i < (ow * oh); ++i) { + if (output_0_u[i] != (uint8)i) + err++; + if (output_0_v[i] != (uint8)(-i)) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 180_u\n"); + print_array(output_180_u, oh, ow); + + printf("output 180_v\n"); + print_array(output_180_v, oh, ow); + + printf("output 90_u\n"); + print_array(output_90_u, oh, ow); + + printf("output 90_v\n"); + print_array(output_90_v, oh, ow); + + printf("output 0_u\n"); + print_array(output_0_u, ow, oh); + + printf("output 0_v\n"); + print_array(output_0_v, ow, oh); + } + + free(input); + free(output_0_u); + free(output_0_v); + free(output_90_u); + free(output_90_v); + free(output_180_u); + free(output_180_v); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotateUV270) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 2) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0_u; + uint8 *output_0_v; + uint8 *output_270_u; + uint8 *output_270_v; + uint8 *output_180_u; + uint8 *output_180_v; + + ow = ih; + oh = iw >> 1; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_0_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_270_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_270_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_u = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180_v = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); i += 2) { 
+ input[i] = i >> 1; + input[i + 1] = -(i >> 1); + } + + RotateUV270(input, iw, output_270_u, ow, output_270_v, ow, + iw >> 1, ih); + + RotatePlane270(output_270_u, ow, output_180_u, oh, ow, oh); + RotatePlane270(output_270_v, ow, output_180_v, oh, ow, oh); + + RotatePlane180(output_180_u, ow, output_0_u, ow, ow, oh); + RotatePlane180(output_180_v, ow, output_0_v, ow, ow, oh); + + for (i = 0; i < (ow * oh); ++i) { + if (output_0_u[i] != (uint8)i) + err++; + if (output_0_v[i] != (uint8)(-i)) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 270_u\n"); + print_array(output_270_u, ow, oh); + + printf("output 270_v\n"); + print_array(output_270_v, ow, oh); + + printf("output 180_u\n"); + print_array(output_180_u, oh, ow); + + printf("output 180_v\n"); + print_array(output_180_v, oh, ow); + + printf("output 0_u\n"); + print_array(output_0_u, oh, ow); + + printf("output 0_v\n"); + print_array(output_0_v, oh, ow); + } + + free(input); + free(output_0_u); + free(output_0_v); + free(output_270_u); + free(output_270_v); + free(output_180_u); + free(output_180_v); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane180) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 8; iw < _rotate_max_w && !err; ++iw) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_180; + + ow = iw; + oh = ih; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane180(input, iw, output_180, ow, iw, ih); + RotatePlane180(output_180, ow, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 180\n"); + print_array(output_180, iw, ih); + + printf("output 0\n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_180); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane270) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 8; iw < _rotate_max_w && !err; ++iw) + for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_90; + uint8 *output_180; + uint8 *output_270; + + ow = ih; + oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + output_180 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane270(input, iw, output_270, ow, iw, ih); + RotatePlane270(output_270, ow, output_180, oh, ow, oh); + RotatePlane270(output_180, oh, output_90, ow, oh, ow); + RotatePlane270(output_90, ow, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d \n", iw, ih); + print_array(input, iw, ih); + + printf("output 270\n"); + print_array(output_270, ow, oh); + + printf("output 180\n"); + print_array(output_180, iw, ih); + + printf("output 90\n"); + print_array(output_90, ow, oh); + + printf("output 0\n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); 
+ free(output_180); + free(output_270); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane90and270) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 4) + for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_90; + ow = ih; + oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane90(input, iw, output_90, ow, iw, ih); + RotatePlane270(output_90, ow, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d\n", iw, ih); + print_array(input, iw, ih); + + printf("output \n"); + print_array(output_90, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane90Pitch) { + int iw, ih; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 4) + for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_90; + int ow = ih; + int oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_90 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane90(input, iw, + output_90 + (ow >> 1), ow, + iw >> 1, ih >> 1); + RotatePlane90(input + (iw >> 1), iw, + output_90 + (ow >> 1) + ow * (oh >> 1), ow, + iw >> 1, ih >> 1); + RotatePlane90(input + iw * (ih >> 1), iw, + output_90, ow, + iw >> 1, ih >> 1); + RotatePlane90(input + (iw >> 1) + iw * (ih >> 1), iw, + output_90 + ow * (oh >> 1), ow, + iw >> 1, ih >> 1); + + RotatePlane270(output_90, ih, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d\n", iw, ih); + print_array(input, iw, ih); + + printf("output \n"); + print_array(output_90, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, RotatePlane270Pitch) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 4) + for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_270; + + ow = ih; + oh = iw; + + input = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_0 = static_cast<uint8*>(calloc(iw * ih, sizeof(uint8))); + output_270 = static_cast<uint8*>(calloc(ow * oh, sizeof(uint8))); + + for (i = 0; i < (iw * ih); ++i) + input[i] = i; + + RotatePlane270(input, iw, + output_270 + ow * (oh >> 1), ow, + iw >> 1, ih >> 1); + RotatePlane270(input + (iw >> 1), iw, + output_270, ow, + iw >> 1, ih >> 1); + RotatePlane270(input + iw * (ih >> 1), iw, + output_270 + (ow >> 1) + ow * (oh >> 1), ow, + iw >> 1, ih >> 1); + RotatePlane270(input + (iw >> 1) + iw * (ih >> 1), iw, + output_270 + (ow >> 1), ow, + iw >> 1, ih >> 1); + + RotatePlane90(output_270, ih, output_0, iw, ow, oh); + + for (i = 0; i < (iw * ih); ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("input %dx%d\n", iw, ih); + print_array(input, iw, ih); + 
printf("output \n"); + print_array(output_270, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_270); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, I420Rotate90) { + int err = 0; + uint8 *orig_y, *orig_u, *orig_v; + uint8 *ro0_y, *ro0_u, *ro0_v; + uint8 *ro90_y, *ro90_u, *ro90_v; + uint8 *ro270_y, *ro270_u, *ro270_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + + int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < (uvw + b); ++j) { + orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff; + orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff; + } + } + + int y_off_0 = b * (yw + (2 * b)) + b; + int uv_off_0 = b * (uvw + (2 * b)) + b; + int y_off_90 = b * (yh + (2 * b)) + b; + int uv_off_90 = b * (uvh + (2 * b)) + b; + + int y_st_0 = yw + (2 * b); + int uv_st_0 = uvw + (2 * b); + int y_st_90 = yh + (2 * b); + int uv_st_90 = uvh + (2 * b); + + I420Rotate(orig_y+y_off_0, y_st_0, + orig_u+uv_off_0, uv_st_0, + orig_v+uv_off_0, uv_st_0, + ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + yw, yh, + kRotateClockwise); + + I420Rotate(ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + ro270_y+y_off_90, y_st_90, + ro270_u+uv_off_90, uv_st_90, + ro270_v+uv_off_90, uv_st_90, + yh, yw, + kRotate180); + + I420Rotate(ro270_y+y_off_90, y_st_90, + ro270_u+uv_off_90, uv_st_90, + ro270_v+uv_off_90, uv_st_90, + ro0_y+y_off_0, y_st_0, + ro0_u+uv_off_0, uv_st_0, + ro0_v+uv_off_0, uv_st_0, + yh, yw, + kRotateClockwise); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != ro0_y[i]) + ++err; + } + + for (i = 0; i < uv_plane_size; ++i) { + if (orig_u[i] != ro0_u[i]) + ++err; + if (orig_v[i] != ro0_v[i]) + ++err; + } + + free(orig_y); + free(orig_u); + free(orig_v); + free(ro0_y); + free(ro0_u); + free(ro0_v); + free(ro90_y); + free(ro90_u); + free(ro90_v); + free(ro270_y); + free(ro270_u); + free(ro270_v); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, I420Rotate270) { + int err = 0; + uint8 *orig_y, *orig_u, *orig_v; + uint8 *ro0_y, *ro0_u, *ro0_v; + uint8 *ro90_y, *ro90_u, *ro90_v; + uint8 *ro270_y, *ro270_u, *ro270_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + + int i, j; + + int y_plane_size = 
(yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + orig_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro90_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < (uvw + b); ++j) { + orig_u[i * (uvw + (2 * b)) + j] = random() & 0xff; + orig_v[i * (uvw + (2 * b)) + j] = random() & 0xff; + } + } + + int y_off_0 = b * (yw + (2 * b)) + b; + int uv_off_0 = b * (uvw + (2 * b)) + b; + int y_off_90 = b * (yh + (2 * b)) + b; + int uv_off_90 = b * (uvh + (2 * b)) + b; + + int y_st_0 = yw + (2 * b); + int uv_st_0 = uvw + (2 * b); + int y_st_90 = yh + (2 * b); + int uv_st_90 = uvh + (2 * b); + + I420Rotate(orig_y+y_off_0, y_st_0, + orig_u+uv_off_0, uv_st_0, + orig_v+uv_off_0, uv_st_0, + ro270_y+y_off_90, y_st_90, + ro270_u+uv_off_90, uv_st_90, + ro270_v+uv_off_90, uv_st_90, + yw, yh, + kRotateCounterClockwise); + + I420Rotate(ro270_y+y_off_90, y_st_90, + ro270_u+uv_off_90, uv_st_90, + ro270_v+uv_off_90, uv_st_90, + ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + yh, yw, + kRotate180); + + I420Rotate(ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + ro0_y+y_off_0, y_st_0, + ro0_u+uv_off_0, uv_st_0, + ro0_v+uv_off_0, uv_st_0, + yh, yw, + kRotateCounterClockwise); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != ro0_y[i]) + ++err; + } + + for (i = 0; i < uv_plane_size; ++i) { + if (orig_u[i] != ro0_u[i]) + ++err; + if (orig_v[i] != ro0_v[i]) + ++err; + } + + free(orig_y); + free(orig_u); + free(orig_v); + free(ro0_y); + free(ro0_u); + free(ro0_v); + free(ro90_y); + free(ro90_u); + free(ro90_v); + free(ro270_y); + free(ro270_u); + free(ro270_v); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, NV12ToI420Rotate90) { + int err = 0; + uint8 *orig_y, *orig_uv; + uint8 *ro0_y, *ro0_u, *ro0_v; + uint8 *ro90_y, *ro90_u, *ro90_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); + + ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro90_y = 
static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro90_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro90_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < ((2 * uvw) + b); j += 2) { + uint8 random_number = random() & 0x7f; + orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; + orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + } + } + + int y_off_0 = b * (yw + (2 * b)) + b; + int uv_off_0 = b * (uvw + (2 * b)) + b; + int y_off_90 = b * (yh + (2 * b)) + b; + int uv_off_90 = b * (uvh + (2 * b)) + b; + + int y_st_0 = yw + (2 * b); + int uv_st_0 = uvw + (2 * b); + int y_st_90 = yh + (2 * b); + int uv_st_90 = uvh + (2 * b); + + NV12ToI420Rotate(orig_y+y_off_0, y_st_0, + orig_uv+y_off_0, y_st_0, + ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + yw, yh, + kRotateClockwise); + + I420Rotate(ro90_y+y_off_90, y_st_90, + ro90_u+uv_off_90, uv_st_90, + ro90_v+uv_off_90, uv_st_90, + ro0_y+y_off_0, y_st_0, + ro0_u+uv_off_0, uv_st_0, + ro0_v+uv_off_0, uv_st_0, + yh, yw, + kRotateCounterClockwise); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != ro0_y[i]) + ++err; + } + + int zero_cnt = 0; + + for (i = 0; i < uv_plane_size; ++i) { + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + ++err; + if (ro0_u[i] != 0) + ++zero_cnt; + } + + if (!zero_cnt) + ++err; + + free(orig_y); + free(orig_uv); + free(ro0_y); + free(ro0_u); + free(ro0_v); + free(ro90_y); + free(ro90_u); + free(ro90_v); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, NV12ToI420Rotate270) { + int err = 0; + uint8 *orig_y, *orig_uv; + uint8 *ro0_y, *ro0_u, *ro0_v; + uint8 *ro270_y, *ro270_u, *ro270_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + + int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); + + ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro270_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro270_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro270_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < ((2 * uvw) + b); j += 2) { + uint8 random_number = random() & 0x7f; + orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; + orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + } + } + + int y_off_0 = b * (yw + (2 * b)) + b; + int uv_off_0 = b * (uvw + (2 * b)) + b; + int y_off_270 = b * (yh + (2 * b)) + b; + int uv_off_270 = b * (uvh + (2 * b)) + b; + + int y_st_0 = yw + (2 * b); + int uv_st_0 = uvw + (2 * b); + int y_st_270 = yh + (2 * b); + int uv_st_270 = uvh + (2 * b); + + 
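+  // Convert and rotate the NV12 source counterclockwise into I420, then
+  // rotate the I420 result back clockwise; the round trip should reproduce
+  // the original Y plane and the de-interleaved U/V values.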
NV12ToI420Rotate(orig_y+y_off_0, y_st_0, + orig_uv+y_off_0, y_st_0, + ro270_y+y_off_270, y_st_270, + ro270_u+uv_off_270, uv_st_270, + ro270_v+uv_off_270, uv_st_270, + yw, yh, + kRotateCounterClockwise); + + I420Rotate(ro270_y+y_off_270, y_st_270, + ro270_u+uv_off_270, uv_st_270, + ro270_v+uv_off_270, uv_st_270, + ro0_y+y_off_0, y_st_0, + ro0_u+uv_off_0, uv_st_0, + ro0_v+uv_off_0, uv_st_0, + yh, yw, + kRotateClockwise); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != ro0_y[i]) + ++err; + } + + int zero_cnt = 0; + + for (i = 0; i < uv_plane_size; ++i) { + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + ++err; + if (ro0_u[i] != 0) + ++zero_cnt; + } + + if (!zero_cnt) + ++err; + + free(orig_y); + free(orig_uv); + free(ro0_y); + free(ro0_u); + free(ro0_v); + free(ro270_y); + free(ro270_u); + free(ro270_v); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, NV12ToI420Rotate180) { + int err = 0; + uint8 *orig_y, *orig_uv; + uint8 *ro0_y, *ro0_u, *ro0_v; + uint8 *ro180_y, *ro180_u, *ro180_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + + int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); + + ro0_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro0_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro0_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + ro180_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + ro180_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + ro180_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < ((2 * uvw) + b); j += 2) { + uint8 random_number = random() & 0x7f; + orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; + orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + } + } + + int y_off = b * (yw + (2 * b)) + b; + int uv_off = b * (uvw + (2 * b)) + b; + + int y_st = yw + (2 * b); + int uv_st = uvw + (2 * b); + + NV12ToI420Rotate(orig_y+y_off, y_st, + orig_uv+y_off, y_st, + ro180_y+y_off, y_st, + ro180_u+uv_off, uv_st, + ro180_v+uv_off, uv_st, + yw, yh, + kRotate180); + + I420Rotate(ro180_y+y_off, y_st, + ro180_u+uv_off, uv_st, + ro180_v+uv_off, uv_st, + ro0_y+y_off, y_st, + ro0_u+uv_off, uv_st, + ro0_v+uv_off, uv_st, + yw, yh, + kRotate180); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != ro0_y[i]) + ++err; + } + + int zero_cnt = 0; + + for (i = 0; i < uv_plane_size; ++i) { + if ((signed char)ro0_u[i] != -(signed char)ro0_v[i]) + ++err; + if (ro0_u[i] != 0) + ++zero_cnt; + } + + if (!zero_cnt) + ++err; + + free(orig_y); + free(orig_uv); + free(ro0_y); + free(ro0_u); + free(ro0_v); + free(ro180_y); + free(ro180_u); + free(ro180_v); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { + int y_err = 0, uv_err = 0; + uint8 *orig_y, *orig_uv; + uint8 *roa_y, *roa_u, *roa_v; + uint8 *rob_y, *rob_u, *rob_v; + uint8 *roc_y, *roc_u, *roc_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + 
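+  // A negative height asks the rotate routines to flip the image vertically
+  // while rotating. The chain below (clockwise with -yh, counterclockwise
+  // with -yw, then a 180 rotation) should compose back to the original.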
int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); + + roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + roc_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + roc_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + roc_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < ((2 * uvw) + b); j += 2) { + uint8 random_number = random() & 0x7f; + orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; + orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + } + } + + int y_off_0 = b * (yw + (2 * b)) + b; + int uv_off_0 = b * (uvw + (2 * b)) + b; + int y_off_90 = b * (yh + (2 * b)) + b; + int uv_off_90 = b * (uvh + (2 * b)) + b; + + int y_st_0 = yw + (2 * b); + int uv_st_0 = uvw + (2 * b); + int y_st_90 = yh + (2 * b); + int uv_st_90 = uvh + (2 * b); + + NV12ToI420Rotate(orig_y+y_off_0, y_st_0, + orig_uv+y_off_0, y_st_0, + roa_y+y_off_90, y_st_90, + roa_u+uv_off_90, uv_st_90, + roa_v+uv_off_90, uv_st_90, + yw, -yh, + kRotateClockwise); + + I420Rotate(roa_y+y_off_90, y_st_90, + roa_u+uv_off_90, uv_st_90, + roa_v+uv_off_90, uv_st_90, + rob_y+y_off_0, y_st_0, + rob_u+uv_off_0, uv_st_0, + rob_v+uv_off_0, uv_st_0, + yh, -yw, + kRotateCounterClockwise); + + I420Rotate(rob_y+y_off_0, y_st_0, + rob_u+uv_off_0, uv_st_0, + rob_v+uv_off_0, uv_st_0, + roc_y+y_off_0, y_st_0, + roc_u+uv_off_0, uv_st_0, + roc_v+uv_off_0, uv_st_0, + yw, yh, + kRotate180); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != roc_y[i]) + ++y_err; + } + + if (y_err) { + printf("input %dx%d \n", yw, yh); + print_array(orig_y, y_st_0, yh + (2 * b)); + + printf("rotate a\n"); + print_array(roa_y, y_st_90, y_st_0); + + printf("rotate b\n"); + print_array(rob_y, y_st_90, y_st_0); + + printf("rotate c\n"); + print_array(roc_y, y_st_0, y_st_90); + } + + int zero_cnt = 0; + + for (i = 0; i < uv_plane_size; ++i) { + if ((signed char)roc_u[i] != -(signed char)roc_v[i]) + ++uv_err; + if (rob_u[i] != 0) + ++zero_cnt; + } + + if (!zero_cnt) + ++uv_err; + + if (uv_err) { + printf("input %dx%d \n", (2 * uvw), uvh); + print_array(orig_uv, y_st_0, uvh + (2 * b)); + + printf("rotate a\n"); + print_array(roa_u, uv_st_90, uv_st_0); + print_array(roa_v, uv_st_90, uv_st_0); + + printf("rotate b\n"); + print_array(rob_u, uv_st_90, uv_st_0); + print_array(rob_v, uv_st_90, uv_st_0); + + printf("rotate c\n"); + print_array(roc_u, uv_st_0, uv_st_90); + print_array(roc_v, uv_st_0, uv_st_90); + } + + free(orig_y); + free(orig_uv); + free(roa_y); + free(roa_u); + free(roa_v); + free(rob_y); + free(rob_u); + free(rob_v); + free(roc_y); + free(roc_u); + free(roc_v); + + EXPECT_EQ(0, y_err + uv_err); +} + +TEST_F(libyuvTest, 
NV12ToI420RotateNegHeight180) { + int y_err = 0, uv_err = 0; + uint8 *orig_y, *orig_uv; + uint8 *roa_y, *roa_u, *roa_v; + uint8 *rob_y, *rob_u, *rob_v; + + int yw = 1024; + int yh = 768; + int b = 128; + int uvw = (yw + 1) >> 1; + int uvh = (yh + 1) >> 1; + int i, j; + + int y_plane_size = (yw + (2 * b)) * (yh + (2 * b)); + int uv_plane_size = (uvw + (2 * b)) * (uvh + (2 * b)); + int o_uv_plane_size = ((2 * uvw) + (2 * b)) * (uvh + (2 * b)); + + srandom(time(NULL)); + + orig_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + orig_uv = static_cast<uint8*>(calloc(o_uv_plane_size, sizeof(uint8))); + + roa_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + roa_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + roa_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + rob_y = static_cast<uint8*>(calloc(y_plane_size, sizeof(uint8))); + rob_u = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + rob_v = static_cast<uint8*>(calloc(uv_plane_size, sizeof(uint8))); + + // fill image buffers with random data + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + (2 * b)) + j] = random() & 0xff; + } + } + + for (i = b; i < (uvh + b); ++i) { + for (j = b; j < ((2 * uvw) + b); j += 2) { + uint8 random_number = random() & 0x7f; + orig_uv[i * ((2 * uvw) + (2 * b)) + j] = random_number; + orig_uv[i * ((2 * uvw) + (2 * b)) + j + 1] = -random_number; + } + } + + int y_off = b * (yw + (2 * b)) + b; + int uv_off = b * (uvw + (2 * b)) + b; + + int y_st = yw + (2 * b); + int uv_st = uvw + (2 * b); + + NV12ToI420Rotate(orig_y+y_off, y_st, + orig_uv+y_off, y_st, + roa_y+y_off, y_st, + roa_u+uv_off, uv_st, + roa_v+uv_off, uv_st, + yw, -yh, + kRotate180); + + I420Rotate(roa_y+y_off, y_st, + roa_u+uv_off, uv_st, + roa_v+uv_off, uv_st, + rob_y+y_off, y_st, + rob_u+uv_off, uv_st, + rob_v+uv_off, uv_st, + yw, -yh, + kRotate180); + + for (i = 0; i < y_plane_size; ++i) { + if (orig_y[i] != rob_y[i]) + ++y_err; + } + + if (y_err) { + printf("input %dx%d \n", yw, yh); + print_array(orig_y, y_st, yh + (2 * b)); + + printf("rotate a\n"); + print_array(roa_y, y_st, yh + (2 * b)); + + printf("rotate b\n"); + print_array(rob_y, y_st, yh + (2 * b)); + } + + int zero_cnt = 0; + + for (i = 0; i < uv_plane_size; ++i) { + if ((signed char)rob_u[i] != -(signed char)rob_v[i]) + ++uv_err; + if (rob_u[i] != 0) + ++zero_cnt; + } + + if (!zero_cnt) + ++uv_err; + + if (uv_err) { + printf("input %dx%d \n", (2 * uvw), uvh); + print_array(orig_uv, y_st, uvh + (2 * b)); + + printf("rotate a\n"); + print_array(roa_u, uv_st, uvh + (2 * b)); + print_array(roa_v, uv_st, uvh + (2 * b)); + + printf("rotate b\n"); + print_array(rob_u, uv_st, uvh + (2 * b)); + print_array(rob_v, uv_st, uvh + (2 * b)); + } + + free(orig_y); + free(orig_uv); + free(roa_y); + free(roa_u); + free(roa_v); + free(rob_y); + free(rob_u); + free(rob_v); + + EXPECT_EQ(0, y_err + uv_err); +} diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc new file mode 100644 index 00000000..e147d78b --- /dev/null +++ b/files/unit_test/scale_test.cc @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "libyuv/scale.h"
+#include "unit_test.h"
+#include <stdlib.h>
+#include <time.h>
+
+using namespace libyuv;
+
+#define align_buffer_16(var, size) \
+  uint8 *var; \
+  uint8 *var##_mem; \
+  var##_mem = reinterpret_cast<uint8*>(calloc(size+15, sizeof(uint8))); \
+  var = reinterpret_cast<uint8*> \
+        ((reinterpret_cast<intptr_t>(var##_mem) + 15) & (~0x0f));
+
+#define free_aligned_buffer_16(var) \
+  free(var##_mem); \
+  var = 0;
+
+TEST_F(libyuvTest, ScaleDownBy4) {
+  int b = 128;
+  int src_width = 1280;
+  int src_height = 720;
+  int src_width_uv = (src_width + 1) >> 1;
+  int src_height_uv = (src_height + 1) >> 1;
+
+  int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
+  int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
+
+  int src_stride_y = 2 * b + src_width;
+  int src_stride_uv = 2 * b + src_width_uv;
+
+  align_buffer_16(src_y, src_y_plane_size)
+  align_buffer_16(src_u, src_uv_plane_size)
+  align_buffer_16(src_v, src_uv_plane_size)
+
+  int dst_width = src_width >> 2;
+  int dst_height = src_height >> 2;
+
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
+
+  int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
+  int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
+
+  int dst_stride_y = 2 * b + dst_width;
+  int dst_stride_uv = 2 * b + dst_width_uv;
+
+  align_buffer_16(dst_y, dst_y_plane_size)
+  align_buffer_16(dst_u, dst_uv_plane_size)
+  align_buffer_16(dst_v, dst_uv_plane_size)
+
+  // Create an image with random data recurring in a 4x4 grid. When the image
+  // is filtered, all the output values should be the same.
+  srandom(time(NULL));
+
+  uint8 block_data[16];
+
+  int i, j;
+
+  // When pulling 16 random numbers there is an infinitesimally small chance
+  // that they are all 0, which would make the output all 0 and
+  // indistinguishable from the zero-initialized output buffer. Avoid this by
+  // forcing one of the values to 128, and keep the remaining values at
+  // least 1 so that point sampling cannot produce an all-zero output either.
+ block_data[0] = 128; + + for (i = 1; i < 16; i++) + block_data[i] = (random() & 0xfe) + 1; + + for (i = b; i < (src_height + b); i += 4) { + for (j = b; j < (src_width + b); j += 4) { + uint8 *ptr = src_y + (i * src_stride_y) + j; + int k, l; + for (k = 0; k < 4; ++k) + for (l = 0; l < 4; ++l) + ptr[k + src_stride_y * l] = block_data[k + 4 * l]; + } + } + + for (i = 1; i < 16; i++) + block_data[i] = (random() & 0xfe) + 1; + + for (i = b; i < (src_height_uv + b); i += 4) { + for (j = b; j < (src_width_uv + b); j += 4) { + uint8 *ptru = src_u + (i * src_stride_uv) + j; + uint8 *ptrv = src_v + (i * src_stride_uv) + j; + int k, l; + for (k = 0; k < 4; ++k) + for (l = 0; l < 4; ++l) { + ptru[k + src_stride_uv * l] = block_data[k + 4 * l]; + ptrv[k + src_stride_uv * l] = block_data[k + 4 * l]; + } + } + } + + int f; + int err = 0; + + // currently three filter modes, defined as FilterMode in scale.h + for (f = 0; f < 3; ++f) { + I420Scale(src_y + (src_stride_y * b) + b, src_stride_y, + src_u + (src_stride_uv * b) + b, src_stride_uv, + src_v + (src_stride_uv * b) + b, src_stride_uv, + src_width, src_height, + dst_y + (dst_stride_y * b) + b, dst_stride_y, + dst_u + (dst_stride_uv * b) + b, dst_stride_uv, + dst_v + (dst_stride_uv * b) + b, dst_stride_uv, + dst_width, dst_height, + static_cast<FilterMode>(f)); + + int value = dst_y[(dst_stride_y * b) + b]; + + // catch the case that the output buffer is all 0 + if (value == 0) + ++err; + + for (i = b; i < (dst_height + b); ++i) { + for (j = b; j < (dst_width + b); ++j) { + if (value != dst_y[(i * dst_stride_y) + j]) + ++err; + } + } + + value = dst_u[(dst_stride_uv * b) + b]; + + if (value == 0) + ++err; + + for (i = b; i < (dst_height_uv + b); ++i) { + for (j = b; j < (dst_width_uv + b); ++j) { + if (value != dst_u[(i * dst_stride_uv) + j]) + ++err; + if (value != dst_v[(i * dst_stride_uv) + j]) + ++err; + } + } + } + + free_aligned_buffer_16(src_y) + free_aligned_buffer_16(src_u) + free_aligned_buffer_16(src_v) + free_aligned_buffer_16(dst_y) + free_aligned_buffer_16(dst_u) + free_aligned_buffer_16(dst_v) + + EXPECT_EQ(0, err); +} diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc new file mode 100644 index 00000000..1996adf1 --- /dev/null +++ b/files/unit_test/unit_test.cc @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <cstring> +#include "unit_test.h" + +class libyuvEnvironment : public ::testing::Environment { + public: + virtual void SetUp() { + } + + virtual void TearDown() { + } +}; + +libyuvTest::libyuvTest() : + _rotate_max_w(128), + _rotate_max_h(128) { +} + +void libyuvTest::SetUp() { +} + +void libyuvTest::TearDown() { +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + libyuvEnvironment* env = new libyuvEnvironment; + ::testing::AddGlobalTestEnvironment(env); + + return RUN_ALL_TESTS(); +}
\ No newline at end of file
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
new file mode 100644
index 00000000..cac30c72
--- /dev/null
+++ b/files/unit_test/unit_test.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef UNIT_TEST_H_
+#define UNIT_TEST_H_
+
+#include <gtest/gtest.h>
+
+class libyuvTest : public ::testing::Test {
+ protected:
+  libyuvTest();
+  virtual void SetUp();
+  virtual void TearDown();
+
+  const int _rotate_max_w;
+  const int _rotate_max_h;
+
+};
+
+#endif  // UNIT_TEST_H_
diff --git a/public.mk b/public.mk
new file mode 100644
index 00000000..259ece21
--- /dev/null
+++ b/public.mk
@@ -0,0 +1,13 @@
+# This file contains all the common make variables that are useful for
+# anyone depending on this library.
+# Note that dependencies on the NDK are not listed explicitly, since the
+# NDK adds them automatically.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/files/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/setup_env.bat b/setup_env.bat
new file mode 100755
index 00000000..d8a33a68
--- /dev/null
+++ b/setup_env.bat
@@ -0,0 +1,7 @@
+:: This script must not rely on any external tools or PATH values.
+@echo OFF
+
+if "%SETUP_ENV_LIBYUV_TOOLS%"=="done" goto :EOF
+set SETUP_ENV_LIBYUV_TOOLS=done
+
+:: TODO(fbarchard): add files\win32 for psnr tool