author    fbarchard@google.com <fbarchard@google.com>  2014-10-07 00:59:31 +0000
committer fbarchard@google.com <fbarchard@google.com>  2014-10-07 00:59:31 +0000
commit    ca308327d234668d15ff85a1798e1d0c12a3b39c (patch)
tree      281f243f1ae7a1900d32a006f83f76bd2cd97a1e
parent    76301c93298c19bfac0b4784706580beace14be5 (diff)
Remove unaligned functions, since most functions support unaligned memory now. This reduces complexity and improves performance for the unaligned case, because the C fallback can be avoided and dispatch overhead is lower. The downside is that old CPUs (Core 2 and earlier) will be slower in the aligned-memory case. The exception is MIPS, which has an alignment requirement: its aligned variant is kept, and only the unaligned variant is removed.
BUG=365
TESTED=unittest builds and passes locally
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/24839004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1113 16f28f9a-4ce2-e073-06de-1de4eb20be90
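The same dispatch simplification repeats across every row function in the diff below: a three-tier Any/Unaligned/aligned selection collapses to two tiers. A minimal, self-contained C++ sketch of the pattern (the row-function names mirror libyuv's real ones, but the stubs and SelectARGBToYRow() are hypothetical scaffolding for illustration, not library code):

// sketch.cc -- illustrative only, not libyuv source.
#include <cstdint>
#include <cstdio>

typedef void (*ARGBToYRowFn)(const uint8_t* src_argb, uint8_t* dst_y, int pix);

#define IS_ALIGNED(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

static void ARGBToYRow_C(const uint8_t*, uint8_t*, int) { std::puts("C"); }
static void ARGBToYRow_Any_SSSE3(const uint8_t*, uint8_t*, int) { std::puts("Any_SSSE3"); }
static void ARGBToYRow_SSSE3(const uint8_t*, uint8_t*, int) { std::puts("SSSE3"); }

// After this patch, only the width matters: the SSSE3 kernel uses movdqu and
// tolerates any pointer/stride alignment. Before, a third tier
// (ARGBToYRow_Unaligned_SSSE3) sat between the two, and ARGBToYRow_SSSE3
// additionally required IS_ALIGNED(src_argb, 16) && IS_ALIGNED(stride, 16).
static ARGBToYRowFn SelectARGBToYRow(int width) {
  ARGBToYRowFn fn = ARGBToYRow_C;  // scalar fallback
  if (width >= 16) {
    fn = ARGBToYRow_Any_SSSE3;     // wraps SSSE3 + C for the odd-width remainder
    if (IS_ALIGNED(width, 16)) {
      fn = ARGBToYRow_SSSE3;       // full-speed path, no alignment checks
    }
  }
  return fn;
}

int main() {
  SelectARGBToYRow(17)(nullptr, nullptr, 17);  // prints Any_SSSE3
  SelectARGBToYRow(32)(nullptr, nullptr, 32);  // prints SSSE3
}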
-rw-r--r--  include/libyuv/row.h         |   89
-rw-r--r--  source/convert.cc            |  123
-rw-r--r--  source/convert_argb.cc       |   65
-rw-r--r--  source/convert_from.cc       |   27
-rw-r--r--  source/convert_from_argb.cc  |  109
-rw-r--r--  source/format_conversion.cc  |    8
-rw-r--r--  source/rotate_argb.cc        |   14
-rw-r--r--  source/row_any.cc            |   66
-rw-r--r--  source/row_common.cc         |   26
-rw-r--r--  source/row_mips.cc           |   83
-rw-r--r--  source/row_posix.cc          | 1422
-rw-r--r--  source/row_win.cc            | 1362
-rw-r--r--  source/scale.cc              |   48
-rw-r--r--  source/scale_argb.cc         |   32
14 files changed, 140 insertions(+), 3334 deletions(-)
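Why this is safe on current x86 but regresses slightly on older parts: a short sketch using SSE2 intrinsics (illustrative only; the patch itself edits movdqa/movdqu inline assembly, as in the row_posix.cc and row_win.cc hunks below).

// load16.cc -- illustrative only, not libyuv source.
#include <emmintrin.h>  // SSE2 intrinsics
#include <cstdint>

__m128i Load16(const uint8_t* p) {
  // movdqu accepts any address. On Nehalem and later it runs at full speed
  // even when p happens to be 16-byte aligned, so one unaligned kernel can
  // replace the aligned/unaligned pair. On Core 2 and earlier, movdqu is
  // slower than movdqa even on aligned data -- the regression the commit
  // message accepts.
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
  // The removed fast path used the aligned load (faults if p is misaligned):
  //   return _mm_load_si128(reinterpret_cast<const __m128i*>(p));  // movdqa
}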
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 885f8de..f5abf1e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -655,13 +655,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -740,16 +733,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -811,15 +794,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
@@ -857,10 +836,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -878,8 +853,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width);
void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -926,8 +899,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -994,7 +965,6 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
@@ -1152,7 +1122,6 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
-// RGB24/RAW are unaligned.
void I422ToRGB24Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1163,51 +1132,6 @@ void I422ToRAWRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_raw,
int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
- int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_bgra,
- int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_abgr,
- int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1280,7 +1204,6 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgba,
int width);
-// RGB24/RAW are unaligned.
void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1494,12 +1417,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1535,12 +1452,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
diff --git a/source/convert.cc b/source/convert.cc
index f205143..19ad659 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -194,10 +194,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) &&
- IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
@@ -291,12 +288,7 @@ static int X420ToI420(const uint8* src_y,
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
- }
+ SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
@@ -317,15 +309,13 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16 &&
+ IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
- if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
- IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
- IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_MIPS_DSPR2;
- }
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
#endif
@@ -440,9 +430,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
@@ -467,14 +455,8 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
@@ -543,14 +525,8 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
@@ -616,14 +592,8 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
@@ -694,14 +664,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
@@ -776,14 +740,8 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
BGRAToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
- BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
}
}
#elif defined(HAS_BGRATOYROW_NEON)
@@ -847,14 +805,8 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
- ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#elif defined(HAS_ABGRTOYROW_NEON)
@@ -918,14 +870,8 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
- RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
}
}
#elif defined(HAS_RGBATOYROW_NEON)
@@ -1029,10 +975,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1146,10 +1089,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1260,10 +1200,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1377,10 +1314,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1495,10 +1429,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index ac0bc3d..3d8250c 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -82,10 +82,7 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I444TOARGBROW_NEON)
@@ -144,10 +141,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
@@ -224,10 +218,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I411TOARGBROW_NEON)
@@ -276,8 +267,7 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_YTOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
YToARGBRow = YToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
@@ -329,10 +319,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
+ I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
#elif defined(HAS_I400TOARGBROW_NEON)
@@ -447,8 +434,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = dst_stride_argb = 0;
}
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
@@ -497,8 +483,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_stride_raw = dst_stride_argb = 0;
}
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
@@ -547,8 +532,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = dst_stride_argb = 0;
}
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
@@ -597,8 +581,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
@@ -647,8 +630,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
@@ -696,10 +678,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
#elif defined(HAS_NV12TOARGBROW_NEON)
@@ -747,10 +726,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
}
#endif
@@ -798,10 +774,7 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
#elif defined(HAS_NV12TOARGBROW_NEON)
@@ -856,11 +829,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
}
}
#elif defined(HAS_YUY2TOARGBROW_NEON)
@@ -909,11 +878,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
}
}
#elif defined(HAS_UYVYTOARGBROW_NEON)
diff --git a/source/convert_from.cc b/source/convert_from.cc
index c1a2f62..068f14c 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -400,12 +400,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
@@ -479,10 +474,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
@@ -551,10 +543,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#elif defined(HAS_I422TOBGRAROW_NEON)
@@ -613,10 +602,7 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#elif defined(HAS_I422TOABGRROW_NEON)
@@ -667,10 +653,7 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#elif defined(HAS_I422TORGBAROW_NEON)
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index de461dd..b0f9b1e 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -54,10 +54,7 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
}
}
#elif defined(HAS_ARGBTOUV444ROW_NEON)
@@ -72,11 +69,7 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
@@ -133,10 +126,7 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
@@ -152,11 +142,7 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
@@ -212,11 +198,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
@@ -285,14 +267,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
@@ -315,10 +291,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
@@ -392,14 +365,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
@@ -422,10 +389,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
@@ -503,10 +467,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
@@ -521,10 +482,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
@@ -605,10 +563,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
@@ -623,10 +578,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
@@ -700,11 +652,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
@@ -867,8 +815,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb565 = 0;
}
#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
@@ -915,8 +862,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb1555 = 0;
}
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
@@ -963,8 +909,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb4444 = 0;
}
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
@@ -1015,14 +960,8 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
@@ -1094,11 +1033,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 3c17371..53d15c0 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -77,8 +77,7 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@@ -319,11 +318,8 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index a8d7fc2..5951f57 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -50,13 +50,11 @@ static void ARGBTranspose(const uint8* src, int src_stride,
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(src, 4)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@@ -103,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
@@ -130,9 +126,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
diff --git a/source/row_any.cc b/source/row_any.cc
index 31ab08a..0d58f5d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -35,19 +35,19 @@ extern "C" {
}
#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7)
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C,
2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C,
1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C,
1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C,
1, 4, 7)
// I422ToRGB565Row_SSSE3 is unaligned.
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
@@ -102,9 +102,9 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
}
#ifdef HAS_NV12TOARGBROW_SSSE3
-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C,
0, 4)
-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C,
0, 4)
#endif // HAS_NV12TOARGBROW_SSSE3
#ifdef HAS_NV12TOARGBROW_NEON
@@ -145,15 +145,15 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
#endif
#if defined(HAS_I400TOARGBROW_SSE2)
-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C,
7, 1, 4)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C,
15, 2, 4)
-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C,
15, 2, 4)
// These require alignment on ARGB, so C is used for remainder.
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
@@ -231,17 +231,17 @@ YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 4, 1, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 4, 1, 16)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 4, 1, 16)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 2, 1, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 2, 1, 16)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
@@ -349,14 +349,14 @@ UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C,
4, 15)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15)
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
@@ -408,7 +408,7 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3
-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3,
ARGBToUV444Row_C, 4, 15, 0)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
@@ -418,11 +418,11 @@ UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
UYVYToUV422Row_C, 2, 31, 1)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3,
ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2,
YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2,
UYVYToUV422Row_C, 2, 15, 1)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
@@ -451,7 +451,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
}
#ifdef HAS_SPLITUVROW_SSE2
-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
@@ -460,7 +460,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2,
SplitUVRow_C, 15)
#endif
#undef SPLITUVROWANY
@@ -477,7 +477,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
}
#ifdef HAS_MERGEUVROW_SSE2
-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
@@ -548,7 +548,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
diff --git a/source/row_common.cc b/source/row_common.cc
index 40a8261..afc74c0 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2137,19 +2137,6 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
free_aligned_buffer_64(row_y);
}
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
@@ -2163,19 +2150,6 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
free_aligned_buffer_64(row_y);
}
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#endif // !defined(LIBYUV_DISABLE_X86)
diff --git a/source/row_mips.cc b/source/row_mips.cc
index da7183b..d713321 100644
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- ".p2align 2 \n"
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lwr $t0, 0(%[src_uv]) \n"
- "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lwr $t1, 4(%[src_uv]) \n"
- "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lwr $t2, 8(%[src_uv]) \n"
- "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lwr $t3, 12(%[src_uv]) \n"
- "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lwr $t5, 16(%[src_uv]) \n"
- "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lwr $t6, 20(%[src_uv]) \n"
- "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lwr $t7, 24(%[src_uv]) \n"
- "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lwr $t8, 28(%[src_uv]) \n"
- "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "addiu %[src_uv], %[src_uv], 32 \n"
- "swr $t9, 0(%[dst_v]) \n"
- "swl $t9, 3(%[dst_v]) \n"
- "swr $t0, 0(%[dst_u]) \n"
- "swl $t0, 3(%[dst_u]) \n"
- "swr $t1, 4(%[dst_v]) \n"
- "swl $t1, 7(%[dst_v]) \n"
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n"
- "swr $t3, 8(%[dst_v]) \n"
- "swl $t3, 11(%[dst_v]) \n"
- "swr $t5, 8(%[dst_u]) \n"
- "swl $t5, 11(%[dst_u]) \n"
- "swr $t6, 12(%[dst_v]) \n"
- "swl $t6, 15(%[dst_v]) \n"
- "swr $t7, 12(%[dst_u]) \n"
- "swl $t7, 15(%[dst_u]) \n"
- "addiu %[dst_u], %[dst_u], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_v], %[dst_v], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
diff --git a/source/row_posix.cc b/source/row_posix.cc
index e92c9f5..94b24a6 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -267,37 +267,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
#endif
);
}
-
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
@@ -775,7 +744,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
);
}
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
@@ -851,44 +820,6 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#endif
);
}
-
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
#endif // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_ARGBTOUVROW_SSSE3
@@ -1031,147 +962,6 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
}
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
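The UVJ variant above differs from the UV one only in its tables (kARGBToUJ/kARGBToVJ, full-range JPEG) and in its rounding: the paddw of kAddUVJ128 lands before psraw, so the bias carries a half-unit of rounding into the shift. In scalar terms, with cb/cg/cr standing in for the table entries (hypothetical names):

    int u  = ((cb * b + cg * g + cr * r) >> 8) + 128;  /* UV:  shift, then offset */
    int uj = (cb * b + cg * g + cr * r + 0x8080) >> 8; /* UVJ: 0x8080 = the 128
                                                          offset plus 0x80 rounding */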
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
@@ -1237,7 +1027,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
);
}
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u,
uint8* dst_v, int width) {
asm volatile (
"movdqa %0,%%xmm4 \n"
@@ -1363,67 +1153,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
);
}
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
@@ -1461,43 +1190,6 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
);
}
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
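The *ToYRow_Unaligned variants removed here (and their surviving movdqu-based twins) all compute the same luma sum; only the coefficient order in each table follows the format's byte layout. A scalar sketch using the BT.601 kARGBToY weights {13, 65, 33} in ARGB's B,G,R,A order (pmaddubsw/phaddw form the sum, psrlw 7 truncates, kAddY16 adds the offset):

    static uint8 RGBToY_sketch(uint8 b, uint8 g, uint8 r) {
      return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    }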
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
@@ -1564,76 +1256,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
}
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
@@ -1671,43 +1293,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
);
}
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
@@ -1745,7 +1330,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
);
}
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
@@ -1848,76 +1433,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
}
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
@@ -1983,76 +1498,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#endif
);
}
-
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
#endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
@@ -2482,191 +1927,6 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
);
}
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
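Every removed I4xx/NV1x row above shares the YUVTORGB step. As a scalar sketch, it is the usual BT.601 fixed-point conversion with 6-bit coefficients; the constants live in kYuvConstants and are approximately YG=74, UB=127, UG=25, VG=52, VR=102 (the nominal 1.164/2.018/0.391/0.813/1.596 weights scaled by 64, with UB clipped to fit a signed byte):

    static uint8 Clamp255(int v) { return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v)); }
    /* y, u, v: 8-bit samples produced by the READYUVxxx/READNV12 step. */
    static void YuvPixel_sketch(uint8 y, uint8 u, uint8 v,
                                uint8* b, uint8* g, uint8* r) {
      int y1 = (y - 16) * 74;
      *b = Clamp255((y1 + 127 * (u - 128)) >> 6);
      *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);
      *r = Clamp255((y1 + 102 * (v - 128)) >> 6);
    }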
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -2786,125 +2046,6 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
);
}
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
@@ -3134,45 +2275,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif
);
}
-
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
#endif // HAS_SPLITUVROW_SSE2
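The deleted SplitUVRow_Unaligned_SSE2 did the same work as the surviving movdqu version: mask the even bytes with 0x00ff words and shift down the odd ones. The scalar equivalent (mirroring SplitUVRow_C in row_common.cc) is simply:

    void SplitUVRow_C_sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        dst_u[i] = src_uv[2 * i + 0];  /* even bytes: the pand mask */
        dst_v[i] = src_uv[2 * i + 1];  /* odd bytes: the psrlw path */
      }
    }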
#ifdef HAS_MERGEUVROW_SSE2
@@ -3207,38 +2309,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#endif
);
}
-
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
- );
-}
#endif // HAS_MERGEUVROW_SSE2
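MergeUVRow is the inverse operation: punpcklbw/punpckhbw interleave the two planes. A scalar sketch:

    void MergeUVRow_C_sketch(const uint8* src_u, const uint8* src_v,
                             uint8* dst_uv, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }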
#ifdef HAS_COPYROW_SSE2
@@ -3307,7 +2377,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
#endif // HAS_COPYROW_X86
#ifdef HAS_COPYROW_ERMS
-// Unaligned Multiple of 1.
+// Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile (
@@ -3604,120 +2674,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
);
}
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
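For reference, YUY2 packs each pixel pair as Y0 U Y1 V, so the removed rows reduce to the scalar sketch below; ToUV additionally averages the chroma of two source rows via pavgb, which rounds. UYVY, removed just after, is the same layout with the byte pairs swapped (U Y0 V Y1).

    void YUY2ToYRow_C_sketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
      int i;
      for (i = 0; i < pix; ++i)
        dst_y[i] = src_yuy2[2 * i];          /* even bytes are luma */
    }
    void YUY2ToUVRow_C_sketch(const uint8* src_yuy2, int stride_yuy2,
                              uint8* dst_u, uint8* dst_v, int pix) {
      int i;
      for (i = 0; i < pix; i += 2) {         /* one U,V per two pixels */
        const uint8* p = src_yuy2 + i * 2;
        const uint8* q = p + stride_yuy2;    /* second row, averaged */
        dst_u[i / 2] = (uint8)((p[1] + q[1] + 1) >> 1);
        dst_v[i / 2] = (uint8)((p[3] + q[3] + 1) >> 1);
      }
    }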
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
@@ -3827,117 +2783,6 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
#endif
);
}
-
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(pix) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
@@ -5581,238 +4426,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqu %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "paddw %%xmm2,%%xmm2 \n"
- "paddw %%xmm3,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
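Both removed interpolators implement the same bilinear row blend: source_y_fraction is halved to a 7-bit weight, the 0/25/50/75/100% cases take pavgb fast paths (which round slightly differently), and the general path computes a weighted byte average. A scalar sketch:

    void InterpolateRow_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                 ptrdiff_t src_stride, int dst_width,
                                 int source_y_fraction) {
      int f = source_y_fraction >> 1;        /* 0..128, as the asm's "shr" */
      int i;
      for (i = 0; i < dst_width; ++i) {
        dst_ptr[i] =
            (uint8)((src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7);
      }
    }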
#ifdef HAS_ARGBTOBAYERROW_SSSE3
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
@@ -5905,33 +4518,6 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#endif
);
}
-
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "movdqu " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
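ARGBShuffleRow applies one pshufb control to every 16-byte group of four pixels. Assuming a shuffler whose first four bytes index within a single pixel (values 0..3), as libyuv's predefined shufflers do, the per-pixel scalar sketch is:

    void ARGBShuffleRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                                 const uint8* shuffler, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        const uint8* p = src_argb + i * 4;
        uint8* q = dst_argb + i * 4;
        q[0] = p[shuffler[0]];
        q[1] = p[shuffler[1]];
        q[2] = p[shuffler[2]];
        q[3] = p[shuffler[3]];
      }
    }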
#ifdef HAS_ARGBSHUFFLEROW_AVX2
diff --git a/source/row_win.cc b/source/row_win.cc
index 9996ccd..a5b99d7 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -126,58 +126,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// Unaligned destination version.
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
- while (width > 0) {
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
- xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
- xmm0 = _mm_adds_epi16(xmm0, xmm3);
- xmm1 = _mm_adds_epi16(xmm1, xmm3);
- xmm2 = _mm_adds_epi16(xmm2, xmm3);
- xmm0 = _mm_srai_epi16(xmm0, 6);
- xmm1 = _mm_srai_epi16(xmm1, 6);
- xmm2 = _mm_srai_epi16(xmm2, 6);
- xmm0 = _mm_packus_epi16(xmm0, xmm0);
- xmm1 = _mm_packus_epi16(xmm1, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
- _mm_storeu_si128((__m128i *)dst_argb, xmm0);
- _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
-
- y_buf += 8;
- u_buf += 4;
- dst_argb += 32;
- width -= 8;
- }
-}
// 32 bit
#else // defined(_M_X64)
@@ -336,35 +284,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
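The removed I400ToARGBRow expands grayscale by replicating Y into B, G and R and forcing alpha to 0xff (the pcmpeqb/pslld mask). Scalar sketch:

    void I400ToARGBRow_C_sketch(const uint8* src_y, uint8* dst_argb, int pix) {
      int i;
      for (i = 0; i < pix; ++i) {
        uint8 y = src_y[i];
        dst_argb[4 * i + 0] = y;     /* B */
        dst_argb[4 * i + 1] = y;     /* G */
        dst_argb[4 * i + 2] = y;     /* R */
        dst_argb[4 * i + 3] = 0xff;  /* A */
      }
    }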
-__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_rgb24
@@ -937,75 +856,6 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
#endif // HAS_ARGBTOYJROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1040,40 +890,6 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1108,40 +924,6 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1176,40 +958,6 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1410,147 +1158,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
- movdqa xmm5, kAddUVJ128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1609,64 +1216,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
}
__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* convert to U and V */
- movdqu xmm0, [eax] // U
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
-
- movdqu xmm0, [eax] // V
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqu [edx + edi], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1726,65 +1275,6 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
}
__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1851,76 +1341,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1987,76 +1407,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -2122,75 +1472,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
}
-__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
-    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
#endif // HAS_ARGBTOYROW_SSSE3
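The payoff of all these deletions is on the caller side: convert.cc and friends no longer need a three-level function-pointer selection. Schematically (a sketch of the dispatch pattern, not the literal code there):

    // Before: three candidates, chosen by CPU flag and pointer/stride alignment.
    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
        ARGBToYRow = ARGBToYRow_SSSE3;
      }
    }
    // After: the single movdqu-based row serves aligned and unaligned alike.
    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;
    }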
#ifdef HAS_I422TOARGBROW_AVX2
@@ -2529,7 +1810,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest unaligned.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
@@ -2767,214 +2048,6 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV444
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ecx, [esp + 12 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV411 // modifies EBX
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // UV
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -3017,47 +2090,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
}
__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -3099,47 +2131,6 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
}
__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -3180,47 +2171,6 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
}
}
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
#endif // HAS_I422TOARGBROW_SSSE3
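
Every I4xx/NV1x-to-ARGB variant deleted in this hunk shares the READ*/YUVTORGB/weave shape; they differ only in how chroma is fetched (planar 4:4:4, 4:2:2, 4:1:1, or interleaved UV/VU for NV12/NV21) and in the weave order for BGRA/ABGR/RGBA outputs. A scalar sketch of the 4:2:2 case with the standard BT.601 video-range constants (illustrative; libyuv's fixed-point tables differ in detail):

#include <stdint.h>

static uint8_t clamp255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

// Scalar model of READYUV422 + YUVTORGB + the ARGB weave: one U/V pair
// colors two Y samples; output bytes are B,G,R,A (libyuv ARGB order).
static void I422ToARGBRow_C_sketch(const uint8_t* y_buf, const uint8_t* u_buf,
                                   const uint8_t* v_buf, uint8_t* dst_argb,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int c = (y_buf[x] - 16) * 298;   // luma, video range
    int d = u_buf[x / 2] - 128;      // 4:2:2: chroma shared by pixel pairs
    int e = v_buf[x / 2] - 128;
    dst_argb[x * 4 + 0] = clamp255((c + 516 * d + 128) >> 8);            // B
    dst_argb[x * 4 + 1] = clamp255((c - 100 * d - 208 * e + 128) >> 8);  // G
    dst_argb[x * 4 + 2] = clamp255((c + 409 * e + 128) >> 8);            // R
    dst_argb[x * 4 + 3] = 255;                                           // A
  }
}
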
#ifdef HAS_YTOARGBROW_SSE2
@@ -3331,8 +2281,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
@@ -3490,42 +2438,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
}
}
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [edx], xmm0
- movdqu [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
#endif // HAS_SPLITUVROW_SSE2
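
The deleted SplitUVRow_Unaligned_SSE2 is a plain byte de-interleave: pand with 0x00ff00ff keeps the even (U) bytes and psrlw 8 exposes the odd (V) bytes. The scalar equivalent, for reference:

#include <stdint.h>

// De-interleave packed UV into separate U and V planes (scalar model).
static void SplitUVRow_C_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes: U
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes:  V
  }
}
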
#ifdef HAS_SPLITUVROW_AVX2
@@ -3597,36 +2509,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
ret
}
}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
- convertloop:
- movdqu xmm0, [eax] // read 16 U's
- movdqu xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqu [edi], xmm0
- movdqu [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
#endif // HAS_MERGEUVROW_SSE2
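
MergeUVRow is the exact inverse: the SSE2 punpcklbw/punpckhbw pair interleaves 16 U's with 16 V's per iteration. Scalar sketch:

#include <stdint.h>

// Interleave separate U and V planes into packed UV (scalar model).
static void MergeUVRow_C_sketch(const uint8_t* src_u, const uint8_t* src_v,
                                uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}
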
#ifdef HAS_MERGEUVROW_AVX2
@@ -3713,7 +2595,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_AVX
-// Unaligned Multiple of 1.
+// Multiple of 1.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
@@ -3730,7 +2612,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
}
#ifdef HAS_COPYROW_X86
-// Unaligned Multiple of 4.
+// Multiple of 4.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
__asm {
@@ -4269,113 +3151,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
}
__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void UYVYToYRow_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
__asm {
@@ -4479,111 +3254,6 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
ret
}
}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_uyvy
- mov esi, [esp + 8 + 8] // stride_uyvy
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uyvy
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
#endif // HAS_YUY2TOYROW_SSE2
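
All six YUY2/UYVY rows removed above reduce to byte selection from the packed 4:2:2 layouts — YUY2 is Y0,U,Y1,V and UYVY is U,Y0,V,Y1 — with the full-height UV variants additionally averaging two source rows (pavgb). Scalar sketches of the Y and UV422 extractions:

#include <stdint.h>

// YUY2: even bytes are luma (the pand 0x00ff00ff mask in the SSE2 path).
static void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                int pix) {
  for (int x = 0; x < pix; ++x)
    dst_y[x] = src_yuy2[x * 2];
}

// UYVY: even bytes are chroma, alternating U,V across each pixel pair.
static void UYVYToUV422Row_C_sketch(const uint8_t* src_uyvy, uint8_t* dst_u,
                                    uint8_t* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    dst_u[x / 2] = src_uyvy[x * 2 + 0];
    dst_v[x / 2] = src_uyvy[x * 2 + 2];
  }
}
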
#ifdef HAS_ARGBBLENDROW_SSE2
@@ -6624,32 +5294,6 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
}
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- sub ecx, 8
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
- ret
- }
-}
-
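
The shuffle row deleted above is a per-pixel pshufb: each byte of the 4-byte shuffler pattern names which source channel feeds that destination byte. A scalar model, with the shuffler semantics inferred from the SSSE3 use:

#include <stdint.h>

// Reorder the 4 channels of each pixel according to shuffler[0..3].
static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const uint8_t* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    for (int i = 0; i < 4; ++i)
      dst_argb[x * 4 + i] = src_argb[x * 4 + shuffler[i]];
  }
}
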
#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
diff --git a/source/scale.cc b/source/scale.cc
index f37da7e..3083a83 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -154,9 +154,7 @@ static void ScalePlaneDown4(int src_width, int src_height,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
#elif defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
@@ -198,9 +196,7 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4_16_NEON;
}
#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
ScaleRowDown4_16_SSE2;
}
@@ -256,8 +252,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
@@ -336,8 +331,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@@ -431,8 +425,7 @@ static void ScalePlaneDown38(int src_width, int src_height,
}
}
#elif defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -508,8 +501,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
}
}
#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@@ -743,11 +735,11 @@ static void ScalePlaneBox(int src_width, int src_height,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
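
Note that with movdqu used throughout the row kernels, these dispatch conditions now gate only on CPU feature and width multiples; IS_ALIGNED survives just for those width and stride checks. A sketch of the assumed macro (libyuv defines an equivalent in its headers; shown here only for context):

#include <stdint.h>

// Assumed definition: true when p is a multiple of the power-of-two a.
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
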
@@ -815,11 +807,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
@@ -1111,9 +1103,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1244,9 +1234,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
@@ -1327,9 +1315,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1362,9 +1348,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index b6d5129..7e081ff 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -53,16 +53,13 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
@@ -98,14 +95,11 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
@@ -139,14 +133,12 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 4)) {
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
@@ -334,9 +326,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -510,9 +500,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -619,9 +607,7 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif