aboutsummaryrefslogtreecommitdiff
path: root/src/dsp
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp')
-rw-r--r-- src/dsp/arm/convolve_10bit_neon.cc 62
-rw-r--r-- src/dsp/arm/convolve_neon.cc 12
-rw-r--r-- src/dsp/arm/film_grain_neon.cc 18
-rw-r--r-- src/dsp/arm/intrapred_directional_neon.cc 2
-rw-r--r-- src/dsp/arm/inverse_transform_neon.cc 9
-rw-r--r-- src/dsp/arm/loop_filter_10bit_neon.cc 2
-rw-r--r-- src/dsp/arm/loop_restoration_10bit_neon.cc 14
-rw-r--r-- src/dsp/arm/loop_restoration_neon.cc 6
-rw-r--r-- src/dsp/average_blend_test.cc 5
-rw-r--r-- src/dsp/cdef_test.cc 18
-rw-r--r-- src/dsp/common_dsp_test.cc 58
-rw-r--r-- src/dsp/convolve_test.cc 20
-rw-r--r-- src/dsp/distance_weighted_blend_test.cc 5
-rw-r--r-- src/dsp/intra_edge_test.cc 10
-rw-r--r-- src/dsp/intrapred_cfl_test.cc 10
-rw-r--r-- src/dsp/intrapred_directional_test.cc 5
-rw-r--r-- src/dsp/intrapred_filter_test.cc 5
-rw-r--r-- src/dsp/intrapred_test.cc 7
-rw-r--r-- src/dsp/inverse_transform_test.cc 5
-rw-r--r-- src/dsp/loop_filter_test.cc 5
-rw-r--r-- src/dsp/loop_restoration_test.cc 28
-rw-r--r-- src/dsp/mask_blend_test.cc 6
-rw-r--r-- src/dsp/motion_field_projection_test.cc 5
-rw-r--r-- src/dsp/motion_vector_search_test.cc 5
-rw-r--r-- src/dsp/obmc_test.cc 5
-rw-r--r-- src/dsp/super_res_test.cc 1
-rw-r--r-- src/dsp/warp_test.cc 1
-rw-r--r-- src/dsp/weight_mask_test.cc 1
-rw-r--r-- src/dsp/x86/common_avx2_test.cc 16
-rw-r--r-- src/dsp/x86/common_avx2_test.h 26
-rw-r--r-- src/dsp/x86/common_sse4_test.cc 16
-rw-r--r-- src/dsp/x86/common_sse4_test.h 26
-rw-r--r-- src/dsp/x86/convolve_avx2.cc 9
-rw-r--r-- src/dsp/x86/convolve_sse4.cc 13
-rw-r--r-- src/dsp/x86/intrapred_directional_sse4.cc 4
-rw-r--r-- src/dsp/x86/loop_restoration_10bit_sse4.cc 9
-rw-r--r-- src/dsp/x86/loop_restoration_sse4.cc 9
37 files changed, 297 insertions, 161 deletions
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
index 389f029..1aa0cc7 100644
--- a/src/dsp/arm/convolve_10bit_neon.cc
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -412,30 +412,21 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t pred_stride, const int width,
const int height, const int16x4_t* const v_tap) {
- assert(width < 8 || num_taps != 4);
- // Don't simplify the redundant if conditions with the template parameters,
- // which helps the compiler generate compact code.
- if (width >= 8 && num_taps != 4) {
- FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>(
- src, src_stride, dest, pred_stride, width, height, v_tap);
- return;
- }
-
// Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
assert(num_taps == 2 || num_taps == 4);
if (num_taps == 2 || num_taps == 4) {
- if (width == 4) {
- FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
- src, src_stride, dest, pred_stride, height, v_tap);
- return;
- }
- assert(width == 2);
- if (!is_compound) {
+ if (width == 2 && !is_compound) {
FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
pred_stride, height, v_tap);
+ return;
}
+ assert(width == 4);
+ FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
+ src, src_stride, dest, pred_stride, height, v_tap);
+ } else {
+ assert(false);
}
}
@@ -454,19 +445,32 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
}
- if (filter_index == 2) { // 8 tap.
- FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index < 2) { // 6 tap.
- FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst,
- dst_stride, width, height, v_tap);
- } else if ((filter_index & 0x4) != 0) { // 4 tap.
- // ((filter_index == 4) | (filter_index == 5))
- FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
- dst_stride, width, height, v_tap);
- } else { // 2 tap.
- FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
- dst_stride, width, height, v_tap);
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ if (width >= 8) {
+ if (filter_index == 2) { // 8 tap.
+ FilterHorizontalWidth8AndUp<8, is_compound, is_2d>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index < 2) { // 6 tap.
+ FilterHorizontalWidth8AndUp<6, is_compound, is_2d>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ assert(filter_index == 3);
+ FilterHorizontalWidth8AndUp<2, is_compound, is_2d>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+ } else {
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
+ FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
+ dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ assert(filter_index == 3);
+ FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
+ dst_stride, width, height, v_tap);
+ }
}
}
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index 5b80da2..97b3f26 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -371,16 +371,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
assert(width <= 4);
assert(filter_index >= 3 && filter_index <= 5);
if (filter_index >= 3 && filter_index <= 5) {
- if (width == 4) {
- FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
- src, src_stride, dest, pred_stride, height, v_tap);
- return;
- }
- assert(width == 2);
- if (!is_compound) {
+ if (width == 2 && !is_compound) {
FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
pred_stride, height, v_tap);
+ return;
}
+ assert(width == 4);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
}
}
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 76e1151..cde887c 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -682,26 +682,14 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
template <int bitdepth, typename Pixel>
inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
- const Pixel* source) {
+ const Pixel* source,
+ const int valid_range = 8) {
int16_t start_vals[8];
static_assert(bitdepth <= kBitdepth10,
"NEON Film Grain is not yet implemented for 12bpp.");
#if LIBGAV1_MSAN
- memset(start_vals, 0, sizeof(start_vals));
+ if (valid_range < 8) memset(start_vals, 0, sizeof(start_vals));
#endif
- for (int i = 0; i < 8; ++i) {
- assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
- start_vals[i] = scaling_lut[source[i]];
- }
- return vld1q_s16(start_vals);
-}
-
-template <int bitdepth, typename Pixel>
-inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
- const Pixel* source, const int valid_range) {
- int16_t start_vals[8];
- static_assert(bitdepth <= kBitdepth10,
- "NEON Film Grain is not yet implemented for 12bpp.");
for (int i = 0; i < valid_range; ++i) {
assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
start_vals[i] = scaling_lut[source[i]];
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index e9bdcf0..d36ef5f 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -1752,7 +1752,7 @@ inline void DirectionalZone2FromLeftCol_8x8(
const int index_scale_bits = 6;
// The values in |offset_y| are negative, except for the first element, which
// is zero.
- int16x8_t offset_y = left_y;
+ int16x8_t offset_y;
int16x8_t shift_upsampled = left_y;
// The shift argument must be a constant, otherwise use upsample_shift
// directly.
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 452f14a..cc4e4a4 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -345,11 +345,12 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
int16x8_t* b,
const int angle,
const bool flip) {
+ // Clang < 14 targeting armv8.1-a+ optimizes vqrdmulhq_n_s16 and vqsubq_s16
+ // (in HadamardRotation) into vqrdmlshq_s16 resulting in an "off by one"
+ // error. This behavior was fixed in 14.0.0:
+ // https://github.com/llvm/llvm-project/commit/82973edfb72a95b442fa6d2bb404e15a4031855e
#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
- defined(__clang__) // ARM v8.1-A
- // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into
- // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use
- // vqrdmulhq_n_s16().
+ defined(__clang__) && __clang_major__ < 14
const int16_t cos128 = Cos128(angle);
const int16_t sin128 = Sin128(angle);
const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128);
diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc
index a9dd98f..abdc074 100644
--- a/src/dsp/arm/loop_filter_10bit_neon.cc
+++ b/src/dsp/arm/loop_filter_10bit_neon.cc
@@ -444,7 +444,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
if (vget_lane_u64(need_filter6, 0) == 0) {
// Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
} else {
@@ -526,7 +525,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
if (vget_lane_u64(need_filter6, 0) == 0) {
// Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
} else {
diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc
index 410bc20..9191080 100644
--- a/src/dsp/arm/loop_restoration_10bit_neon.cc
+++ b/src/dsp/arm/loop_restoration_10bit_neon.cc
@@ -1130,7 +1130,13 @@ inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
const uint8x8_t idx = vqmovn_u16(index);
uint8_t temp[8];
vst1_u8(temp, idx);
- *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+ // offset == 0 is assumed to be the first call to this function. The value is
+ // duplicated to avoid -Wuninitialized warnings under gcc.
+ if (offset == 0) {
+ *ma = vdupq_n_u8(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+ }
*ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
*ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
*ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
@@ -1712,8 +1718,6 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0);
s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16);
Square(s[0], sq);
- // Quiet "may be used uninitialized" warning.
- mas[0] = mas[1] = vdupq_n_u8(0);
BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
int x = 0;
@@ -2067,8 +2071,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
Square(s[0], sq);
- // Quiet "may be used uninitialized" warning.
- mas[0] = mas[1] = vdupq_n_u8(0);
BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
int x = 0;
@@ -2255,8 +2257,6 @@ inline void BoxFilterLastRow(
s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
Square(s[0], sq);
- // Quiet "may be used uninitialized" warning.
- ma3[0] = ma3[1] = vdupq_n_u8(0);
BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
sq, &ma3[0], &ma5[0], b3, b5);
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index cd8552e..adb8f36 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -1125,7 +1125,11 @@ inline void CalculateIntermediate(const uint16x8_t sum,
val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3.
val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2.
val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1.
- *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+ // offset == 0 is assumed to be the first call to this function. Note
+ // vget_high_u8(*ma) is not used in this case to avoid a -Wuninitialized
+ // warning with some versions of gcc. vdup_n_u8(0) could work as well, but in
+ // most cases clang and gcc generated better code with this version.
+ *ma = (offset == 0) ? vcombine_u8(val, val)
: vcombine_u8(vget_low_u8(*ma), val);
// b = ma * b * one_over_n
diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc
index 6d1100a..67d592f 100644
--- a/src/dsp/average_blend_test.cc
+++ b/src/dsp/average_blend_test.cc
@@ -76,9 +76,8 @@ class AverageBlendTest : public testing::TestWithParam<BlockSize>,
if (absl::StartsWith(test_case, "C/")) {
base_func_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- AverageBlendInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ AverageBlendInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
AverageBlendInit_NEON();
} else {
diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc
index c25d7df..e2db17a 100644
--- a/src/dsp/cdef_test.cc
+++ b/src/dsp/cdef_test.cc
@@ -79,11 +79,11 @@ class CdefDirectionTest : public testing::TestWithParam<int> {
const char* const test_case = test_info->test_suite_name();
if (absl::StartsWith(test_case, "C/")) {
} else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
CdefInit_SSE4_1();
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- CdefInit_AVX2();
- }
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ CdefInit_AVX2();
} else if (absl::StartsWith(test_case, "NEON/")) {
CdefInit_NEON();
} else {
@@ -275,11 +275,11 @@ class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
} else if (absl::StartsWith(test_case, "NEON/")) {
CdefInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
CdefInit_SSE4_1();
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- CdefInit_AVX2();
- }
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ CdefInit_AVX2();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
@@ -304,7 +304,7 @@ template <int bitdepth, typename Pixel>
void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
const int id = static_cast<int>(param_.rows4x4 < 4) * 3 +
(param_.subsampling_x + param_.subsampling_y) * 6;
- absl::Duration elapsed_time;
+ absl::Duration elapsed_time[kMaxPlanes];
for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x;
@@ -355,7 +355,7 @@ void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
source_ + offset, kSourceStride, block_height, primary_strength_,
secondary_strength_, damping_, direction_, dest_[plane],
kTestBufferStride * sizeof(dest_[0][0]));
- elapsed_time += absl::Now() - start;
+ elapsed_time[plane] += absl::Now() - start;
}
}
@@ -379,7 +379,7 @@ void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
ASSERT_NE(expected_digest, nullptr);
test_utils::CheckMd5Digest(kCdef, kCdefFilterName, expected_digest,
reinterpret_cast<uint8_t*>(dest_[plane]),
- sizeof(dest_[plane]), elapsed_time);
+ sizeof(dest_[plane]), elapsed_time[plane]);
}
}
diff --git a/src/dsp/common_dsp_test.cc b/src/dsp/common_dsp_test.cc
new file mode 100644
index 0000000..3342ce8
--- /dev/null
+++ b/src/dsp/common_dsp_test.cc
@@ -0,0 +1,58 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/match.h"
+#include "gtest/gtest.h"
+#include "src/dsp/x86/common_avx2_test.h"
+#include "src/dsp/x86/common_sse4_test.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+class CommonDspTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->name();
+ if (absl::StartsWith(test_case, "SSE41")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ } else if (absl::StartsWith(test_case, "AVX2")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CommonDspTest);
+
+#if LIBGAV1_ENABLE_AVX2
+TEST_F(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+ AVX2RightShiftWithRoundingS16Test();
+}
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_ENABLE_SSE4_1
+TEST_F(CommonDspTest, SSE41RightShiftWithRoundingS16) {
+ SSE41RightShiftWithRoundingS16Test();
+}
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
index 42cdeb7..b8c1f1d 100644
--- a/src/dsp/convolve_test.cc
+++ b/src/dsp/convolve_test.cc
@@ -624,13 +624,11 @@ class ConvolveTest : public testing::TestWithParam<
if (absl::StartsWith(test_case, "C/")) {
base_convolve_func_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- ConvolveInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ConvolveInit_SSE4_1();
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- ConvolveInit_AVX2();
- }
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ ConvolveInit_AVX2();
} else if (absl::StartsWith(test_case, "NEON/")) {
ConvolveInit_NEON();
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -1084,13 +1082,11 @@ class ConvolveScaleTest
if (absl::StartsWith(test_case, "C/")) {
base_convolve_scale_func_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- ConvolveInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ConvolveInit_SSE4_1();
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- ConvolveInit_AVX2();
- }
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ ConvolveInit_AVX2();
} else if (absl::StartsWith(test_case, "NEON/")) {
ConvolveInit_NEON();
#if LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc
index 88040b4..0d6e1cd 100644
--- a/src/dsp/distance_weighted_blend_test.cc
+++ b/src/dsp/distance_weighted_blend_test.cc
@@ -63,9 +63,8 @@ class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
if (absl::StartsWith(test_case, "C/")) {
base_func_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- DistanceWeightedBlendInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ DistanceWeightedBlendInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
DistanceWeightedBlendInit_NEON();
} else {
diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc
index b287544..75c45be 100644
--- a/src/dsp/intra_edge_test.cc
+++ b/src/dsp/intra_edge_test.cc
@@ -97,9 +97,8 @@ class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
if (absl::StartsWith(test_case, "C/")) {
base_intra_edge_filter_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraEdgeInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraEdgeInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraEdgeInit_NEON();
} else {
@@ -356,9 +355,8 @@ class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
if (absl::StartsWith(test_case, "C/")) {
base_intra_edge_upsampler_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraEdgeInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraEdgeInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraEdgeInit_NEON();
} else {
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
index 8415d51..53f3075 100644
--- a/src/dsp/intrapred_cfl_test.cc
+++ b/src/dsp/intrapred_cfl_test.cc
@@ -156,9 +156,8 @@ class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraPredCflInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraPredCflInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredCflInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
@@ -304,9 +303,8 @@ class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraPredCflInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraPredCflInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredCflInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
index 8d4fa63..2c81b27 100644
--- a/src/dsp/intrapred_directional_test.cc
+++ b/src/dsp/intrapred_directional_test.cc
@@ -187,9 +187,8 @@ class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraPredDirectionalInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraPredDirectionalInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredDirectionalInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc
index c8d60a0..d5694f6 100644
--- a/src/dsp/intrapred_filter_test.cc
+++ b/src/dsp/intrapred_filter_test.cc
@@ -158,9 +158,8 @@ class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
// No need to compare C with itself.
base_filter_intra_pred_ = nullptr;
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraPredFilterInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredFilterInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraPredFilterInit_NEON();
} else {
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
index cca1c73..5753817 100644
--- a/src/dsp/intrapred_test.cc
+++ b/src/dsp/intrapred_test.cc
@@ -154,10 +154,9 @@ class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
if (absl::StartsWith(test_case, "C/")) {
memset(base_intrapreds_, 0, sizeof(base_intrapreds_));
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- IntraPredInit_SSE4_1();
- IntraPredSmoothInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
IntraPredInit_NEON();
IntraPredSmoothInit_NEON();
diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc
index 081dcc1..d74a33a 100644
--- a/src/dsp/inverse_transform_test.cc
+++ b/src/dsp/inverse_transform_test.cc
@@ -181,9 +181,8 @@ class InverseTransformTest
if (absl::StartsWith(test_case, "C/")) {
memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_));
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- InverseTransformInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ InverseTransformInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
InverseTransformInit_NEON();
InverseTransformInit10bpp_NEON();
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
index 63ed530..93a273a 100644
--- a/src/dsp/loop_filter_test.cc
+++ b/src/dsp/loop_filter_test.cc
@@ -128,9 +128,8 @@ class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
if (absl::StartsWith(test_case, "C/")) {
memset(base_loop_filters_, 0, sizeof(base_loop_filters_));
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- LoopFilterInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopFilterInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
LoopFilterInit_NEON();
#if LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc
index 5c645b8..d6dcd9c 100644
--- a/src/dsp/loop_restoration_test.cc
+++ b/src/dsp/loop_restoration_test.cc
@@ -69,19 +69,17 @@ class SelfGuidedFilterTest : public testing::TestWithParam<int>,
const char* const test_case = test_info->test_suite_name();
if (absl::StartsWith(test_case, "C/")) {
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- LoopRestorationInit_AVX2();
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ LoopRestorationInit_AVX2();
#if LIBGAV1_MAX_BITDEPTH >= 10
- LoopRestorationInit10bpp_AVX2();
+ LoopRestorationInit10bpp_AVX2();
#endif
- }
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- LoopRestorationInit_SSE4_1();
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopRestorationInit_SSE4_1();
#if LIBGAV1_MAX_BITDEPTH >= 10
- LoopRestorationInit10bpp_SSE4_1();
+ LoopRestorationInit10bpp_SSE4_1();
#endif
- }
} else if (absl::StartsWith(test_case, "NEON/")) {
LoopRestorationInit_NEON();
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -381,19 +379,17 @@ class WienerFilterTest : public testing::TestWithParam<int>,
const char* const test_case = test_info->test_suite_name();
if (absl::StartsWith(test_case, "C/")) {
} else if (absl::StartsWith(test_case, "AVX2/")) {
- if ((GetCpuInfo() & kAVX2) != 0) {
- LoopRestorationInit_AVX2();
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ LoopRestorationInit_AVX2();
#if LIBGAV1_MAX_BITDEPTH >= 10
- LoopRestorationInit10bpp_AVX2();
+ LoopRestorationInit10bpp_AVX2();
#endif
- }
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- LoopRestorationInit_SSE4_1();
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopRestorationInit_SSE4_1();
#if LIBGAV1_MAX_BITDEPTH >= 10
- LoopRestorationInit10bpp_SSE4_1();
+ LoopRestorationInit10bpp_SSE4_1();
#endif
- }
} else if (absl::StartsWith(test_case, "NEON/")) {
LoopRestorationInit_NEON();
#if LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc
index 29dd43b..06793e5 100644
--- a/src/dsp/mask_blend_test.cc
+++ b/src/dsp/mask_blend_test.cc
@@ -270,9 +270,8 @@ class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
} else if (absl::StartsWith(test_case, "NEON/")) {
MaskBlendInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- MaskBlendInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MaskBlendInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
@@ -427,6 +426,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
if (bitdepth != 8) {
ASSERT_EQ(func_8bpp_, nullptr);
}
+ ASSERT_NE(func_, nullptr);
func_(source1_, source2_, src_2_stride, mask_, mask_stride, width, height,
dest_, kDestStride);
}
diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc
index 3a47cc7..8a57696 100644
--- a/src/dsp/motion_field_projection_test.cc
+++ b/src/dsp/motion_field_projection_test.cc
@@ -63,9 +63,8 @@ class MotionFieldProjectionTest : public testing::TestWithParam<int> {
} else if (absl::StartsWith(test_case, "NEON/")) {
MotionFieldProjectionInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- MotionFieldProjectionInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MotionFieldProjectionInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc
index a7b2ec8..5c680d6 100644
--- a/src/dsp/motion_vector_search_test.cc
+++ b/src/dsp/motion_vector_search_test.cc
@@ -55,9 +55,8 @@ class MotionVectorSearchTest : public testing::TestWithParam<int>,
} else if (absl::StartsWith(test_case, "NEON/")) {
MotionVectorSearchInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- MotionVectorSearchInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MotionVectorSearchInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc
index a10feb2..289fd66 100644
--- a/src/dsp/obmc_test.cc
+++ b/src/dsp/obmc_test.cc
@@ -193,9 +193,8 @@ class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
const absl::string_view test_case = test_info->test_suite_name();
if (absl::StartsWith(test_case, "C/")) {
} else if (absl::StartsWith(test_case, "SSE41/")) {
- if ((GetCpuInfo() & kSSE4_1) != 0) {
- ObmcInit_SSE4_1();
- }
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ObmcInit_SSE4_1();
} else if (absl::StartsWith(test_case, "NEON/")) {
ObmcInit_NEON();
} else {
diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc
index 7b253ff..0c3537c 100644
--- a/src/dsp/super_res_test.cc
+++ b/src/dsp/super_res_test.cc
@@ -96,6 +96,7 @@ class SuperResTest : public testing::TestWithParam<SuperResTestParam>,
} else if (absl::StartsWith(test_case, "NEON/")) {
SuperResInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
SuperResInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc
index c64c8d6..f93ad8b 100644
--- a/src/dsp/warp_test.cc
+++ b/src/dsp/warp_test.cc
@@ -275,6 +275,7 @@ class WarpTest : public testing::TestWithParam<WarpTestParam> {
} else if (absl::StartsWith(test_case, "NEON/")) {
WarpInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
WarpInit_SSE4_1();
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc
index 74ec03c..a080ec4 100644
--- a/src/dsp/weight_mask_test.cc
+++ b/src/dsp/weight_mask_test.cc
@@ -223,6 +223,7 @@ class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>,
} else if (absl::StartsWith(test_case, "NEON/")) {
WeightMaskInit_NEON();
} else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
WeightMaskInit_SSE4_1();
}
func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_];
diff --git a/src/dsp/x86/common_avx2_test.cc b/src/dsp/x86/common_avx2_test.cc
index 2062683..4b294b0 100644
--- a/src/dsp/x86/common_avx2_test.cc
+++ b/src/dsp/x86/common_avx2_test.cc
@@ -12,26 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_avx2_test.h"
#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_AVX2
#include <cstdint>
+#include "src/dsp/x86/common_avx2.h"
#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
-namespace {
// Show that RightShiftWithRounding_S16() is equal to
// RightShiftWithRounding() only for values less than or equal to
// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
// negative values.
-TEST(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+void AVX2RightShiftWithRoundingS16Test() {
for (int bits = 0; bits < 16; ++bits) {
const int bias = (1 << bits) >> 1;
for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -53,15 +54,20 @@ TEST(CommonDspTest, AVX2RightShiftWithRoundingS16) {
}
}
-} // namespace
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_TARGETING_AVX2
-TEST(CommonDspTest, AVX2) {
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test() {
GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to enable "
"the tests.";
}
+} // namespace dsp
+} // namespace libgav1
+
#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/common_avx2_test.h b/src/dsp/x86/common_avx2_test.h
new file mode 100644
index 0000000..1124f7f
--- /dev/null
+++ b/src/dsp/x86/common_avx2_test.h
@@ -0,0 +1,26 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
index 3288cfc..592630c 100644
--- a/src/dsp/x86/common_sse4_test.cc
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -12,26 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/common_sse4_test.h"
#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_SSE4_1
#include <cstdint>
+#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
-namespace {
// Show that RightShiftWithRounding_S16() is equal to
// RightShiftWithRounding() only for values less than or equal to
// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
// negative values.
-TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
+void SSE41RightShiftWithRoundingS16Test() {
for (int bits = 0; bits < 16; ++bits) {
const int bias = (1 << bits) >> 1;
for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -50,15 +51,20 @@ TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
}
}
-} // namespace
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_TARGETING_SSE4_1
-TEST(CommonDspTest, SSE41) {
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test() {
GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
"the tests.";
}
+} // namespace dsp
+} // namespace libgav1
+
#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/common_sse4_test.h b/src/dsp/x86/common_sse4_test.h
new file mode 100644
index 0000000..169439a
--- /dev/null
+++ b/src/dsp/x86/common_sse4_test.h
@@ -0,0 +1,26 @@
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 6e94347..ff51aee 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -27,6 +27,7 @@
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_avx2.h"
#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
namespace libgav1 {
@@ -607,6 +608,10 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
alignas(32) uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
@@ -1374,6 +1379,10 @@ void ConvolveCompound2D_AVX2(
alignas(32) uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f427c4c..99b87d6 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -28,6 +28,7 @@
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
namespace libgav1 {
namespace dsp {
@@ -254,6 +255,10 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
@@ -617,6 +622,10 @@ void ConvolveCompound2D_SSE4_1(
alignas(16) uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
// Horizontal filter.
// Filter types used for width <= 4 are different from those for width > 4.
@@ -1157,6 +1166,10 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) int16_t
intermediate_result[kIntermediateAllocWidth *
(2 * kIntermediateAllocWidth + kSubPixelTaps)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
index bc61745..2e64d21 100644
--- a/src/dsp/x86/intrapred_directional_sse4.cc
+++ b/src/dsp/x86/intrapred_directional_sse4.cc
@@ -1023,6 +1023,10 @@ void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
uint8_t left_buffer[288];
memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0x33, 128);
+ memset(left_buffer, 0x44, 128);
+#endif
const uint8_t* top_ptr = top_buffer + 144;
const uint8_t* left_ptr = left_buffer + 144;
if (width == 4 || height == 4) {
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
index 6625d51..029e168 100644
--- a/src/dsp/x86/loop_restoration_10bit_sse4.cc
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -1079,7 +1079,14 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index,
// general-purpose register to process. Faster than using _mm_extract_epi8().
uint8_t temp[8];
StoreLo8(temp, idx);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ // offset == 0 is assumed to be the first call to this function. The value is
+ // mov'd to avoid -Wuninitialized warnings under gcc. mov should be at least
+ // equivalent to, if not faster than, pinsrb.
+ if (offset == 0) {
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ }
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index b4df072..8c24c39 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -1222,7 +1222,14 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index,
// general-purpose register to process. Faster than using _mm_extract_epi8().
uint8_t temp[8];
StoreLo8(temp, idx);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ // offset == 0 is assumed to be the first call to this function. The value is
+ // mov'd to avoid -Wuninitialized warnings under gcc. mov should be at least
+ // equivalent to, if not faster than, pinsrb.
+ if (offset == 0) {
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ }
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
*ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);