// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8x8_t a_weight,
                               const uint8x8_t b_weight) {
  const uint16x8_t a_product = vmull_u8(a, a_weight);
  const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
  return vrshrn_n_u16(sum, 5 /*log2(32)*/);
}

// For vertical operations the weights are one constant value.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                               const uint8_t weight) {
  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
}

// Fill |left| and |right| with the appropriate values for a given |base_step|.
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const uint8x8_t left_step, const uint8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  const uint8x16_t mixed = vld1q_u8(source);
  *left = VQTbl1U8(mixed, left_step);
  *right = VQTbl1U8(mixed, right_step);
}

// Handle signed step arguments by ignoring the sign. Negative values are
// considered out of range and overwritten later.
inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
                         const int8x8_t left_step, const int8x8_t right_step,
                         uint8x8_t* left, uint8x8_t* right) {
  LoadStepwise(source, vreinterpret_u8_s8(left_step),
               vreinterpret_u8_s8(right_step), left, right);
}

// Process 4 or 8 |width| by any |height|.
template <int width>
inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
                                 const ptrdiff_t stride, const int height,
                                 const uint8_t* LIBGAV1_RESTRICT const top,
                                 const int xstep, const bool upsampled) {
  assert(width == 4 || width == 8);
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits = 6 - upsample_shift;
  const int max_base_x = (width + height - 1) << upsample_shift;
  const int8x8_t max_base = vdup_n_s8(max_base_x);
  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
  const int8x8_t all = vcreate_s8(0x0706050403020100);
  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
  const int8x8_t base_step = upsampled ? even : all;
  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
  int top_x = xstep;
  int y = 0;
  do {
    const int top_base_x = top_x >> scale_bits;
    if (top_base_x >= max_base_x) {
      for (int i = y; i < height; ++i) {
        memset(dst, top[max_base_x], 4 /* width */);
        dst += stride;
      }
      return;
    }
    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
    // Zone2 uses negative values for xstep. Use signed values to compare
    // |top_base_x| to |max_base_x|.
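    // Illustrative arithmetic (hypothetical inputs, not from the original
    // source): with no upsampling and top_x == 100, top_base_x == 100 >> 6 == 1
    // and shift == (100 & 0x3F) >> 1 == 18, so each output pixel blends as
    // (top[base + i] * 14 + top[base + i + 1] * 18 + 16) >> 5.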
const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step); const uint8x8_t max_base_mask = vclt_s8(base_v, max_base); // 4 wide subsamples the output. 8 wide subsamples the input. if (width == 4) { const uint8x8_t left_values = vld1_u8(top + top_base_x); const uint8x8_t right_values = RightShiftVector<8>(left_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); // If |upsampled| is true then extract every other value for output. const uint8x8_t value_stepped = vtbl1_u8(value, vreinterpret_u8_s8(base_step)); const uint8x8_t masked_value = vbsl_u8(max_base_mask, value_stepped, top_max_base); StoreLo4(dst, masked_value); } else /* width == 8 */ { uint8x8_t left_values, right_values; // WeightedBlend() steps up to Q registers. Downsample the input to avoid // doing extra calculations. LoadStepwise(top + top_base_x, base_step, right_step, &left_values, &right_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); const uint8x8_t masked_value = vbsl_u8(max_base_mask, value, top_max_base); vst1_u8(dst, masked_value); } dst += stride; top_x += xstep; } while (++y < height); } // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int width, const int height, const uint8_t* LIBGAV1_RESTRICT const top, const int xstep, const bool upsampled) { assert(width % 8 == 0); const int upsample_shift = static_cast(upsampled); const int scale_bits = 6 - upsample_shift; const int max_base_x = (width + height - 1) << upsample_shift; const int8x8_t max_base = vdup_n_s8(max_base_x); const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]); const int8x8_t all = vcreate_s8(0x0706050403020100); const int8x8_t even = vcreate_s8(0x0e0c0a0806040200); const int8x8_t base_step = upsampled ? even : all; const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1)); const int8x8_t block_step = vdup_n_s8(8 << upsample_shift); int top_x = xstep; int y = 0; do { const int top_base_x = top_x >> scale_bits; if (top_base_x >= max_base_x) { for (int i = y; i < height; ++i) { memset(dst, top[max_base_x], 4 /* width */); dst += stride; } return; } const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1; // Zone2 uses negative values for xstep. Use signed values to compare // |top_base_x| to |max_base_x|. int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step); int x = 0; do { const uint8x8_t max_base_mask = vclt_s8(base_v, max_base); // Extract the input values based on |upsampled| here to avoid doing twice // as many calculations. 
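      // For example, when |upsampled| is true |base_step| holds
      // {0, 2, 4, ..., 14} and |right_step| holds {1, 3, 5, ..., 15}, so the
      // two table lookups in LoadStepwise() pull the eight even-indexed
      // samples and their odd-indexed neighbors out of a single 16-byte load.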
uint8x8_t left_values, right_values; LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values, &right_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); const uint8x8_t masked_value = vbsl_u8(max_base_mask, value, top_max_base); vst1_u8(dst + x, masked_value); base_v = vadd_s8(base_v, block_step); x += 8; } while (x < width); top_x += xstep; dst += stride; } while (++y < height); } void DirectionalIntraPredictorZone1_NEON( void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const int width, const int height, const int xstep, const bool upsampled_top) { const auto* const top = static_cast(top_row); auto* dst = static_cast(dest); assert(xstep > 0); const int upsample_shift = static_cast(upsampled_top); const uint8x8_t all = vcreate_u8(0x0706050403020100); if (xstep == 64) { assert(!upsampled_top); const uint8_t* top_ptr = top + 1; int y = 0; do { memcpy(dst, top_ptr, width); memcpy(dst + stride, top_ptr + 1, width); memcpy(dst + 2 * stride, top_ptr + 2, width); memcpy(dst + 3 * stride, top_ptr + 3, width); dst += 4 * stride; top_ptr += 4; y += 4; } while (y < height); } else if (width == 4) { DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top); } else if (xstep > 51) { // 7.11.2.10. Intra edge upsample selection process // if ( d <= 0 || d >= 40 ) useUpsample = 0 // For |upsample_top| the delta is from vertical so |prediction_angle - 90|. // In |kDirectionalIntraPredictorDerivative[]| angles less than 51 will meet // this criteria. The |xstep| value for angle 51 happens to be 51 as well. // Shallower angles have greater xstep values. assert(!upsampled_top); const int max_base_x = ((width + height) - 1); const uint8x8_t max_base = vdup_n_u8(max_base_x); const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]); const uint8x8_t block_step = vdup_n_u8(8); int top_x = xstep; int y = 0; do { const int top_base_x = top_x >> 6; const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1; uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all); int x = 0; // Only calculate a block of 8 when at least one of the output values is // within range. Otherwise it can read off the end of |top|. const int must_calculate_width = std::min(width, max_base_x - top_base_x + 7) & ~7; for (; x < must_calculate_width; x += 8) { const uint8x8_t max_base_mask = vclt_u8(base_v, max_base); // Since these |xstep| values can not be upsampled the load is // simplified. const uint8x8_t left_values = vld1_u8(top + top_base_x + x); const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); const uint8x8_t masked_value = vbsl_u8(max_base_mask, value, top_max_base); vst1_u8(dst + x, masked_value); base_v = vadd_u8(base_v, block_step); } memset(dst + x, top[max_base_x], width - x); dst += stride; top_x += xstep; } while (++y < height); } else { DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top); } } // Process 4 or 8 |width| by 4 or 8 |height|. template inline void DirectionalZone3_WxH( uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y, const int ystep, const int upsample_shift) { assert(width == 4 || width == 8); assert(height == 4 || height == 8); const int scale_bits = 6 - upsample_shift; // Zone3 never runs out of left_column values. 
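  // For instance, with hypothetical width == height == 8, ystep == 49 and no
  // upsampling, the left_base_y bound checked below is
  // ((49 * 8) >> 6) + 7 == 13, safely below max_base_y == 15.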
assert((width + height - 1) << upsample_shift > // max_base_y ((ystep * width) >> scale_bits) + (/* base_step */ 1 << upsample_shift) * (height - 1)); // left_base_y // Limited improvement for 8x8. ~20% faster for 64x64. const uint8x8_t all = vcreate_u8(0x0706050403020100); const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200); const uint8x8_t base_step = upsample_shift ? even : all; const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1)); uint8_t* dst = dest; uint8x8_t left_v[8], right_v[8], value_v[8]; const uint8_t* const left = left_column; const int index_0 = base_left_y; LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step, &left_v[0], &right_v[0]); value_v[0] = WeightedBlend(left_v[0], right_v[0], ((index_0 << upsample_shift) & 0x3F) >> 1); const int index_1 = base_left_y + ystep; LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step, &left_v[1], &right_v[1]); value_v[1] = WeightedBlend(left_v[1], right_v[1], ((index_1 << upsample_shift) & 0x3F) >> 1); const int index_2 = base_left_y + ystep * 2; LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step, &left_v[2], &right_v[2]); value_v[2] = WeightedBlend(left_v[2], right_v[2], ((index_2 << upsample_shift) & 0x3F) >> 1); const int index_3 = base_left_y + ystep * 3; LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step, &left_v[3], &right_v[3]); value_v[3] = WeightedBlend(left_v[3], right_v[3], ((index_3 << upsample_shift) & 0x3F) >> 1); const int index_4 = base_left_y + ystep * 4; LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step, &left_v[4], &right_v[4]); value_v[4] = WeightedBlend(left_v[4], right_v[4], ((index_4 << upsample_shift) & 0x3F) >> 1); const int index_5 = base_left_y + ystep * 5; LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step, &left_v[5], &right_v[5]); value_v[5] = WeightedBlend(left_v[5], right_v[5], ((index_5 << upsample_shift) & 0x3F) >> 1); const int index_6 = base_left_y + ystep * 6; LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step, &left_v[6], &right_v[6]); value_v[6] = WeightedBlend(left_v[6], right_v[6], ((index_6 << upsample_shift) & 0x3F) >> 1); const int index_7 = base_left_y + ystep * 7; LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step, &left_v[7], &right_v[7]); value_v[7] = WeightedBlend(left_v[7], right_v[7], ((index_7 << upsample_shift) & 0x3F) >> 1); // 8x8 transpose. 
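  // The three vtrn/vuzp stages below transpose at successively larger element
  // sizes: vtrnq_u8 interleaves neighboring bytes across row pairs, vtrnq_u16
  // does the same for 16-bit groups, and vuzpq_u32 sorts the 32-bit words so
  // that each 64-bit half of |d0|/|d1| is one transposed output row.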
const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]), vcombine_u8(value_v[1], value_v[5])); const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]), vcombine_u8(value_v[3], value_v[7])); const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0])); const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1])); const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); if (width == 4) { StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0]))); if (height == 4) return; dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1]))); dst += stride; StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1]))); } else { vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0]))); if (height == 4) return; dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1]))); dst += stride; vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1]))); } } // Because the source values "move backwards" as the row index increases, the // indices derived from ystep are generally negative. This is accommodated by // making sure the relative indices are within [-15, 0] when the function is // called, and sliding them into the inclusive range [0, 15], relative to a // lower base address. constexpr int kPositiveIndexOffset = 15; // Process 4 or 8 |width| by any |height|. template inline void DirectionalZone2FromLeftCol_WxH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, const int upsample_shift) { assert(width == 4 || width == 8); // The shift argument must be a constant. int16x8_t offset_y, shift_upsampled = left_y; if (upsample_shift) { offset_y = vshrq_n_s16(left_y, 5); shift_upsampled = vshlq_n_s16(shift_upsampled, 1); } else { offset_y = vshrq_n_s16(left_y, 6); } // Select values to the left of the starting point. // The 15th element (and 16th) will be all the way at the end, to the right. // With a negative ystep everything else will be "left" of them. // This supports cumulative steps up to 15. We could support up to 16 by doing // separate loads for |left_values| and |right_values|. vtbl supports 2 Q // registers as input which would allow for cumulative offsets of 32. 
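  // Worked example (hypothetical values, no left upsampling): a lane of
  // |left_y| equal to -200 gives offset_y == -200 >> 6 == -4; adding
  // kPositiveIndexOffset (15) produces table index 11, which addresses the
  // sample 4 pixels to the left of left_column + (y << upsample_shift).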
const int16x8_t sampler = vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset)); const uint8x8_t left_values = vqmovun_s16(sampler); const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1)); const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f)); const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1)); const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul); int y = 0; do { uint8x8_t src_left, src_right; LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift), left_values, right_values, &src_left, &src_right); const uint8x8_t val = WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul); if (width == 4) { StoreLo4(dst, val); } else { vst1_u8(dst, val); } dst += stride; } while (++y < height); } // Process 4 or 8 |width| by any |height|. template inline void DirectionalZone1Blend_WxH( uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, const int xstep, const int upsample_shift) { assert(width == 4 || width == 8); const int scale_bits_x = 6 - upsample_shift; const uint8x8_t all = vcreate_u8(0x0706050403020100); const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200); const uint8x8_t base_step = upsample_shift ? even : all; const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1)); int y = 0; do { const uint8_t* const src = top_row + (top_x >> scale_bits_x); uint8x8_t left, right; LoadStepwise(src, base_step, right_step, &left, &right); const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1; const uint8x8_t val = WeightedBlend(left, right, shift); uint8x8_t dst_blend = vld1_u8(dest); // |zone_bounds| values can be negative. uint8x8_t blend = vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6))); uint8x8_t output = vbsl_u8(blend, val, dst_blend); if (width == 4) { StoreLo4(dest, output); } else { vst1_u8(dest, output); } dest += stride; zone_bounds += xstep; top_x -= xstep; } while (++y < height); } // 7.11.2.4 (8) 90 < angle > 180 // The strategy for these functions (4xH and 8+xH) is to know how many blocks // can be processed with just pixels from |top_ptr|, then handle mixed blocks, // then handle only blocks that take from |left_ptr|. Additionally, a fast // index-shuffle approach is used for pred values from |left_column| in // sections that permit it. inline void DirectionalZone2_4xH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, const int xstep, const int ystep, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast(upsampled_left); const int upsample_top_shift = static_cast(upsampled_top); // Helper vector. const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; // Loop incrementers for moving by block (4xN). Vertical still steps by 8. If // it's only 4, it will be finished in the first iteration. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; const int min_height = (height == 4) ? 4 : 8; // All columns from |min_top_only_x| to the right will only need |top_row| to // compute and can therefore call the Zone1 functions. This assumes |xstep| is // at least 3. assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4); // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 
0 : xstep - 1; const int left_base_increment = ystep >> 6; const int ystep_remainder = ystep & 0x3F; // If the 64 scaling is regarded as a decimal point, the first value of the // left_y vector omits the portion which is covered under the left_column // offset. The following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. if (min_top_only_x > 0) { // Round down to the nearest multiple of 8 (or 4, if height is 4). const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep, upsampled_top); if (max_top_only_y == height) return; int y = max_top_only_y; dst += stride * y; const int xstep_y = xstep * y; // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. const int min_left_only_y = std::min((4 << 6) / xstep, height); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; // +8 increment is OK because if height is 4 this only goes once. for (; y < min_left_only_y; y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { DirectionalZone2FromLeftCol_WxH<4>( dst, stride, min_height, left_column + ((y - left_base_increment) << upsample_left_shift), left_y, upsample_left_shift); DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row, xstep_bounds, top_x, xstep, upsample_top_shift); } // Loop over y for left_only rows. const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < height; y += 8, dst += stride8) { DirectionalZone3_WxH<4>( dst, stride, min_height, left_column + ((y - left_base_increment) << upsample_left_shift), base_left_y, -ystep, upsample_left_shift); } } else { DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep, upsampled_top); } } template inline void DirectionalZone2_8xH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, const int xstep, const int ystep, const int x, const int left_offset, const int xstep_bounds_base, const int16x8_t left_y, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast(upsampled_left); const int upsample_top_shift = static_cast(upsampled_top); // Loop incrementers for moving by block (8x8). This function handles blocks // with height 4 as well. They are calculated in one pass so these variables // do not get used. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; // Cover 8x4 case. const int min_height = (height == 4) ? 4 : 8; // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. uint8_t* dst_x = dst + x; // Round down to the nearest multiple of 8 (or 4, if height is 4). 
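  // For example, a hypothetical xstep of 4 keeps 64 / 4 == 16 rows in the
  // top-only stage, while an xstep of 20 gives 64 / 20 == 3, which rounds down
  // to 0 and the mixed stage starts immediately.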
const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, top_row + (x << upsample_top_shift), -xstep, upsampled_top); if (max_top_only_y == height) return; int y = max_top_only_y; dst_x += stride * y; const int xstep_y = xstep * y; // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. Round up to the nearest 8. const int min_left_only_y = Align(std::min(((x + 8) << 6) / xstep, height), 8); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < min_left_only_y; y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { if (shuffle_left_column) { DirectionalZone2FromLeftCol_WxH<8>( dst_x, stride, min_height, left_column + ((left_offset + y) << upsample_left_shift), left_y, upsample_left_shift); } else { DirectionalZone3_WxH<8>( dst_x, stride, min_height, left_column + ((left_offset + y) << upsample_left_shift), base_left_y, -ystep, upsample_left_shift); } DirectionalZone1Blend_WxH<8>( dst_x, stride, min_height, top_row + (x << upsample_top_shift), xstep_bounds, top_x, xstep, upsample_top_shift); } // Loop over y for left_only rows. for (; y < height; y += 8, dst_x += stride8) { DirectionalZone3_WxH<8>( dst_x, stride, min_height, left_column + ((left_offset + y) << upsample_left_shift), base_left_y, -ystep, upsample_left_shift); } } // Process a multiple of 8 |width|. inline void DirectionalZone2_WxH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, const int height, const int xstep, const int ystep, const bool upsampled_top, const bool upsampled_left) { const int ystep8 = ystep << 3; // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; const int left_base_increment = ystep >> 6; const int ystep_remainder = ystep & 0x3F; const int left_base_increment8 = ystep8 >> 6; const int ystep_remainder8 = ystep8 & 0x3F; const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8); // If the 64 scaling is regarded as a decimal point, the first value of the // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); // For ystep > 90, at least two sets of 8 columns can be fully computed from // top_row only. const int min_top_only_x = std::min((height * xstep) >> 6, width); // Analysis finds that, for most angles (ystep < 132), all segments that use // both top_row and left_column can compute from left_column using byte // shuffles from a single vector. For steeper angles, the shuffle is also // fully reliable when x >= 32. const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. 
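  // For example, with a hypothetical height of 32 and xstep of 64,
  // min_top_only_x == (32 * 64) >> 6 == 32: columns 0 through 31 run through
  // the mixed loops below and any remaining columns are handled by
  // DirectionalZone1_WxH at the end.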
int x = 0; for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { DirectionalZone2_8xH(dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, xstep_bounds_base, left_y, upsampled_top, upsampled_left); } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { DirectionalZone2_8xH(dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, xstep_bounds_base, left_y, upsampled_top, upsampled_left); } if (x < width) { const int upsample_top_shift = static_cast(upsampled_top); DirectionalZone1_WxH(dst + x, stride, width - x, height, top_row + (x << upsample_top_shift), -xstep, upsampled_top); } } void DirectionalIntraPredictorZone2_NEON( void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column, const int width, const int height, const int xstep, const int ystep, const bool upsampled_top, const bool upsampled_left) { // Increasing the negative buffer for this function allows more rows to be // processed at a time without branching in an inner loop to check the base. uint8_t top_buffer[288]; uint8_t left_buffer[288]; #if LIBGAV1_MSAN memset(top_buffer, 0, sizeof(top_buffer)); memset(left_buffer, 0, sizeof(left_buffer)); #endif // LIBGAV1_MSAN memcpy(top_buffer + 128, static_cast(top_row) - 16, 160); memcpy(left_buffer + 128, static_cast(left_column) - 16, 160); const uint8_t* top_ptr = top_buffer + 144; const uint8_t* left_ptr = left_buffer + 144; auto* dst = static_cast(dest); if (width == 4) { DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep, upsampled_top, upsampled_left); } else { DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep, ystep, upsampled_top, upsampled_left); } } void DirectionalIntraPredictorZone3_NEON( void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, const void* LIBGAV1_RESTRICT const left_column, const int width, const int height, const int ystep, const bool upsampled_left) { const auto* const left = static_cast(left_column); assert(ystep > 0); const int upsample_shift = static_cast(upsampled_left); const int scale_bits = 6 - upsample_shift; const int base_step = 1 << upsample_shift; if (width == 4 || height == 4) { // This block can handle all sizes but the specializations for other sizes // are faster. const uint8x8_t all = vcreate_u8(0x0706050403020100); const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200); const uint8x8_t base_step_v = upsampled_left ? 
even : all; const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1)); int y = 0; do { int x = 0; do { auto* dst = static_cast(dest); dst += y * stride + x; uint8x8_t left_v[4], right_v[4], value_v[4]; const int ystep_base = ystep * x; const int offset = y * base_step; const int index_0 = ystep_base + ystep * 1; LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v, right_step, &left_v[0], &right_v[0]); value_v[0] = WeightedBlend(left_v[0], right_v[0], ((index_0 << upsample_shift) & 0x3F) >> 1); const int index_1 = ystep_base + ystep * 2; LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v, right_step, &left_v[1], &right_v[1]); value_v[1] = WeightedBlend(left_v[1], right_v[1], ((index_1 << upsample_shift) & 0x3F) >> 1); const int index_2 = ystep_base + ystep * 3; LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v, right_step, &left_v[2], &right_v[2]); value_v[2] = WeightedBlend(left_v[2], right_v[2], ((index_2 << upsample_shift) & 0x3F) >> 1); const int index_3 = ystep_base + ystep * 4; LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v, right_step, &left_v[3], &right_v[3]); value_v[3] = WeightedBlend(left_v[3], right_v[3], ((index_3 << upsample_shift) & 0x3F) >> 1); // 8x4 transpose. const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]); const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]); const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); StoreLo4(dst, vreinterpret_u8_u16(c0.val[0])); dst += stride; StoreLo4(dst, vreinterpret_u8_u16(c1.val[0])); dst += stride; StoreLo4(dst, vreinterpret_u8_u16(c0.val[1])); dst += stride; StoreLo4(dst, vreinterpret_u8_u16(c1.val[1])); if (height > 4) { dst += stride; StoreHi4(dst, vreinterpret_u8_u16(c0.val[0])); dst += stride; StoreHi4(dst, vreinterpret_u8_u16(c1.val[0])); dst += stride; StoreHi4(dst, vreinterpret_u8_u16(c0.val[1])); dst += stride; StoreHi4(dst, vreinterpret_u8_u16(c1.val[1])); } x += 4; } while (x < width); y += 8; } while (y < height); } else { // 8x8 at a time. // Limited improvement for 8x8. ~20% faster for 64x64. int y = 0; do { int x = 0; do { auto* dst = static_cast(dest); dst += y * stride + x; const int ystep_base = ystep * (x + 1); DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift), ystep_base, ystep, upsample_shift); x += 8; } while (x < width); y += 8; } while (y < height); } } void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON; dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON; dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON; } } // namespace } // namespace low_bitdepth #if LIBGAV1_MAX_BITDEPTH >= 10 namespace high_bitdepth { namespace { // Blend two values based on weights that sum to 32. inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, const int a_weight, const int b_weight) { const uint16x4_t a_product = vmul_n_u16(a, a_weight); const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight); return vrshr_n_u16(sum, 5 /*log2(32)*/); } // Blend two values based on weights that sum to 32. 
inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, const uint16_t a_weight, const uint16_t b_weight) { const uint16x8_t a_product = vmulq_n_u16(a, a_weight); const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight); return vrshrq_n_u16(sum, 5 /*log2(32)*/); } // Blend two values based on weights that sum to 32. inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, const uint16x8_t a_weight, const uint16x8_t b_weight) { const uint16x8_t a_product = vmulq_u16(a, a_weight); const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); return vrshrq_n_u16(sum, 5 /*log2(32)*/); } // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, const bool upsampled) { if (upsampled) { *dest = vld2_u16(source); } else { dest->val[0] = vld1_u16(source); dest->val[1] = vld1_u16(source + 1); } } // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, const bool upsampled) { if (upsampled) { *dest = vld2q_u16(source); } else { dest->val[0] = vld1q_u16(source); dest->val[1] = vld1q_u16(source + 1); } } // For Wx4 blocks, load the source for 2 columns. The source for the second // column is held in the high half of each vector. inline void LoadEdgeVals2x4(uint16x8x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source_low, const uint16_t* LIBGAV1_RESTRICT const source_high, const bool upsampled) { if (upsampled) { const uint16x4x2_t low = vld2_u16(source_low); const uint16x4x2_t high = vld2_u16(source_high); dest->val[0] = vcombine_u16(low.val[0], high.val[0]); dest->val[1] = vcombine_u16(low.val[1], high.val[1]); } else { dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high)); dest->val[1] = vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1)); } } template inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const top, const int xstep) { const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; const int max_base_x = (4 + height - 1) << upsample_shift; const int16x4_t max_base = vdup_n_s16(max_base_x); const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]); const int16x4_t index_offset = {0, 1, 2, 3}; // All rows from |min_corner_only_y| down will simply use Memset. // |max_base_x| is always greater than |height|, so clipping the denominator // to 1 is enough to make the logic work. const int xstep_units = std::max(xstep >> index_scale_bits, 1); const int min_corner_only_y = std::min(max_base_x / xstep_units, height); int top_x = xstep; int y = 0; for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { const int top_base_x = top_x >> index_scale_bits; // To accommodate reuse of this function in Zone2, permit negative values // for |xstep|. const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; const uint16_t shift_1 = 32 - shift_0; // Use signed values to compare |top_base_x| to |max_base_x|. 
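    // For example, if top_base_x == max_base_x - 2, lanes 0 and 1 of |base_x|
    // pass the compare below and keep the blended values, while lanes 2 and 3
    // are replaced with |final_top_val| by the bit-select.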
const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset); const uint16x4_t max_base_mask = vclt_s16(base_x, max_base); uint16x4x2_t sampled_top_row; LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); const uint16x4_t combined = WeightedBlend( sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); // If |upsampled| is true then extract every other value for output. const uint16x4_t masked_result = vbsl_u16(max_base_mask, combined, final_top_val); vst1_u16(dst, masked_result); } for (; y < height; ++y) { Memset(dst, top[max_base_x], 4 /* width */); dst += stride; } } // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. template inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int width, const int height, const uint16_t* LIBGAV1_RESTRICT const top, const int xstep) { assert(width % 8 == 0); const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; const int max_base_index = (width + height - 1) << upsample_shift; const int16x8_t max_base_x = vdupq_n_s16(max_base_index); const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; const int base_step = 1 << upsample_shift; const int base_step8 = base_step << 3; const int16x8_t block_step = vdupq_n_s16(base_step8); // All rows from |min_corner_only_y| down will simply use Memset. // |max_base_x| is always greater than |height|, so clipping the denominator // to 1 is enough to make the logic work. const int xstep_units = std::max(xstep >> index_scale_bits, 1); const int min_corner_only_y = std::min(max_base_index / xstep_units, height); int top_x = xstep; int y = 0; for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { int top_base_x = top_x >> index_scale_bits; // To accommodate reuse of this function in Zone2, permit negative values // for |xstep|. const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; const uint16_t shift_1 = 32 - shift_0; // Use signed values to compare |top_base_x| to |max_base_x|. int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); int x = 0; do { const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); uint16x8x2_t sampled_top_row; LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); const uint16x8_t combined = WeightedBlend( sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); const uint16x8_t masked_result = vbslq_u16(max_base_mask, combined, final_top_val); vst1q_u16(dst + x, masked_result); base_x = vaddq_s16(base_x, block_step); top_base_x += base_step8; x += 8; } while (x < width); } for (int i = y; i < height; ++i) { Memset(dst, top[max_base_index], width); dst += stride; } } // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. 
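// This variant is used for blocks with width >= 32: rows before
// |max_no_corner_y| are known to be fully in range, so they skip the
// |max_base_x| masking entirely, and within a partially in-range row only the
// columns at or past |min_corner_only_x| fall back to the corner value.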
inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int width, const int height, const uint16_t* LIBGAV1_RESTRICT const top, const int xstep, const bool upsampled) { assert(width % 8 == 0); const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; const int max_base_index = (width + height - 1) << upsample_shift; const int16x8_t max_base_x = vdupq_n_s16(max_base_index); const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; const int base_step = 1 << upsample_shift; const int base_step8 = base_step << 3; const int16x8_t block_step = vdupq_n_s16(base_step8); // All rows from |min_corner_only_y| down will simply use Memset. // |max_base_x| is always greater than |height|, so clipping the denominator // to 1 is enough to make the logic work. const int xstep_units = std::max(xstep >> index_scale_bits, 1); const int min_corner_only_y = std::min(max_base_index / xstep_units, height); // Rows up to this y-value can be computed without checking for bounds. const int max_no_corner_y = std::min( ((max_base_index - (base_step * width)) << index_scale_bits) / xstep, height); // No need to check for exceeding |max_base_x| in the first loop. int y = 0; int top_x = xstep; for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) { int top_base_x = top_x >> index_scale_bits; // To accommodate reuse of this function in Zone2, permit negative values // for |xstep|. const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; const uint16_t shift_1 = 32 - shift_0; int x = 0; do { uint16x8x2_t sampled_top_row; LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); const uint16x8_t combined = WeightedBlend( sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); vst1q_u16(dst + x, combined); top_base_x += base_step8; x += 8; } while (x < width); } for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { int top_base_x = top_x >> index_scale_bits; // To accommodate reuse of this function in Zone2, permit negative values // for |xstep|. const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; const uint16_t shift_1 = 32 - shift_0; // Use signed values to compare |top_base_x| to |max_base_x|. int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); int x = 0; const int min_corner_only_x = std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) & ~7; for (; x < min_corner_only_x; x += 8, top_base_x += base_step8, base_x = vaddq_s16(base_x, block_step)) { const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); uint16x8x2_t sampled_top_row; LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); const uint16x8_t combined = WeightedBlend( sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); const uint16x8_t masked_result = vbslq_u16(max_base_mask, combined, final_top_val); vst1q_u16(dst + x, masked_result); } // Corner-only section of the row. 
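    // Every remaining lane of this row would index at or past
    // |max_base_index|, so the tail is filled with the corner pixel directly.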
Memset(dst + x, top[max_base_index], width - x); } for (; y < height; ++y) { Memset(dst, top[max_base_index], width); dst += stride; } } void DirectionalIntraPredictorZone1_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const int width, const int height, const int xstep, const bool upsampled_top) { const auto* const top = static_cast(top_row); auto* dst = static_cast(dest); stride /= sizeof(top[0]); assert(xstep > 0); if (xstep == 64) { assert(!upsampled_top); const uint16_t* top_ptr = top + 1; const int width_bytes = width * sizeof(top[0]); int y = height; do { memcpy(dst, top_ptr, width_bytes); memcpy(dst + stride, top_ptr + 1, width_bytes); memcpy(dst + 2 * stride, top_ptr + 2, width_bytes); memcpy(dst + 3 * stride, top_ptr + 3, width_bytes); dst += 4 * stride; top_ptr += 4; y -= 4; } while (y != 0); } else { if (width == 4) { if (upsampled_top) { DirectionalZone1_4xH(dst, stride, height, top, xstep); } else { DirectionalZone1_4xH(dst, stride, height, top, xstep); } } else if (width >= 32) { if (upsampled_top) { DirectionalZone1_Large(dst, stride, width, height, top, xstep, true); } else { DirectionalZone1_Large(dst, stride, width, height, top, xstep, false); } } else if (upsampled_top) { DirectionalZone1_WxH(dst, stride, width, height, top, xstep); } else { DirectionalZone1_WxH(dst, stride, width, height, top, xstep); } } } // ----------------------------------------------------------------------------- // Zone 3 // This can be considered "the transpose of Zone 1." In Zone 1, the fractional // step applies when moving vertically in the destination block, connected to // the change in |y|, whereas in this mode, the step applies when moving // horizontally, connected to the change in |x|. This makes vectorization very // complicated in row-order, because a given vector may need source pixels that // span 16 or 32 pixels in steep angles, requiring multiple expensive table // lookups and checked loads. Rather than work in row order, it is simpler to // compute |dest| in column order, and then store the transposed results. // Compute 4x4 sub-blocks. // Example of computed sub-blocks of a 4x8 block before and after transpose: // 00 10 20 30 00 01 02 03 // 01 11 21 31 10 11 12 13 // 02 12 22 32 20 21 22 23 // 03 13 23 33 30 31 32 33 // ----------- --> ----------- // 40 50 60 70 40 41 42 43 // 41 51 61 71 50 51 52 53 // 42 52 62 72 60 61 62 63 // 43 53 63 73 70 71 72 73 template inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; // Compute one column at a time, then transpose for storage. 
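  // Illustrative arithmetic (hypothetical ystep == 73, no upsampling): the
  // first column uses left_y == 73, so left_offset == 73 >> 6 == 1 and
  // shift_0 == (73 & 0x3F) >> 1 == 4, blending samples starting at left[1]
  // and left[2] with weights 28 and 4.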
uint16x4_t result[4]; int left_y = base_left_y + ystep; int left_offset = left_y >> index_scale_bits; int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; int shift_1 = 32 - shift_0; uint16x4x2_t sampled_left_col; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); Transpose4x4(result); Store4(dst, result[0]); dst += stride; Store4(dst, result[1]); dst += stride; Store4(dst, result[2]); dst += stride; Store4(dst, result[3]); } template inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; const uint16x8_t inverter = vdupq_n_u16(32); uint16x8x2_t sampled_left_col; // Compute two columns at a time, then transpose for storage. uint16x8_t result[4]; // The low half of pre-transpose vectors contains columns 0 through 3. int left_y_low = base_left_y + ystep; int left_offset_low = left_y_low >> index_scale_bits; int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; // The high half of pre-transpose vectors contains columns 4 through 7. 
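  // |left_y_high| starts four ystep increments further along the left column,
  // so each uint16x8 operation computes one row of columns 0 through 3 in its
  // low half and columns 4 through 7 in its high half; |weights_0| and
  // |weights_1| are packed the same way.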
int left_y_high = left_y_low + (ystep << 2); int left_offset_high = left_y_high >> index_scale_bits; int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; uint16x8_t weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); uint16x8_t weights_1 = vsubq_u16(inverter, weights_0); LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], &left[left_offset_high], upsampled); result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], weights_1, weights_0); left_y_low += ystep; left_offset_low = left_y_low >> index_scale_bits; shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; left_y_high += ystep; left_offset_high = left_y_high >> index_scale_bits; shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); weights_1 = vsubq_u16(inverter, weights_0); LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], &left[left_offset_high], upsampled); result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], weights_1, weights_0); left_y_high += ystep; left_y_low += ystep; left_offset_low = left_y_low >> index_scale_bits; shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; left_offset_high = left_y_high >> index_scale_bits; shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); weights_1 = vsubq_u16(inverter, weights_0); LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], &left[left_offset_high], upsampled); result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], weights_1, weights_0); left_y_low += ystep; left_offset_low = left_y_low >> index_scale_bits; shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; left_y_high += ystep; left_offset_high = left_y_high >> index_scale_bits; shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); weights_1 = vsubq_u16(inverter, weights_0); LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], &left[left_offset_high], upsampled); result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], weights_1, weights_0); Transpose4x8(result); Store8(dst, result[0]); dst += stride; Store8(dst, result[1]); dst += stride; Store8(dst, result[2]); dst += stride; Store8(dst, result[3]); } template inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; // Compute one column at a time, then transpose for storage. 
uint16x8_t result[4]; int left_y = base_left_y + ystep; int left_offset = left_y >> index_scale_bits; int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; int shift_1 = 32 - shift_0; uint16x8x2_t sampled_left_col; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); Transpose4x8(result); Store4(dst, vget_low_u16(result[0])); dst += stride; Store4(dst, vget_low_u16(result[1])); dst += stride; Store4(dst, vget_low_u16(result[2])); dst += stride; Store4(dst, vget_low_u16(result[3])); dst += stride; Store4(dst, vget_high_u16(result[0])); dst += stride; Store4(dst, vget_high_u16(result[1])); dst += stride; Store4(dst, vget_high_u16(result[2])); dst += stride; Store4(dst, vget_high_u16(result[3])); } template inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { assert(height == 8 || height == 16); const int upsample_shift = static_cast(upsampled); DirectionalZone3_4x8(dest, stride, left, ystep); if (height == 16) { dest += stride << 3; DirectionalZone3_4x8(dest, stride, left + (8 << upsample_shift), ystep); } } template inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { assert(width <= 16); if (width == 4) { DirectionalZone3_4x4(dest, stride, left, ystep); return; } DirectionalZone3_8x4(dest, stride, left, ystep); if (width == 16) { const int base_left_y = ystep << 3; DirectionalZone3_8x4(dest + 8 * sizeof(uint16_t), stride, left, ystep, base_left_y); } } template inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast(upsampled); const int index_scale_bits = 6 - upsample_shift; // Compute one column at a time, then transpose for storage. 
uint16x8_t result[8]; int left_y = base_left_y + ystep; uint16x8x2_t sampled_left_col; int left_offset = left_y >> index_scale_bits; int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; int shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); left_y += ystep; left_offset = left_y >> index_scale_bits; shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; shift_1 = 32 - shift_0; LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], shift_1, shift_0); Transpose8x8(result); Store8(dest, result[0]); dest += stride; Store8(dest, result[1]); dest += stride; Store8(dest, result[2]); dest += stride; Store8(dest, result[3]); dest += stride; Store8(dest, result[4]); dest += stride; Store8(dest, result[5]); dest += stride; Store8(dest, result[6]); dest += stride; Store8(dest, result[7]); } template inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { const int upsample_shift = static_cast(upsampled); // Zone3 never runs out of left_column values. 
assert((width + height - 1) << upsample_shift > // max_base_y ((ystep * width) >> (6 - upsample_shift)) + (/* base_step */ 1 << upsample_shift) * (height - 1)); // left_base_y int y = 0; do { int x = 0; uint8_t* dst_x = dest + y * stride; do { const int base_left_y = ystep * x; DirectionalZone3_8x8( dst_x, stride, left + (y << upsample_shift), ystep, base_left_y); dst_x += 8 * sizeof(uint16_t); x += 8; } while (x < width); y += 8; } while (y < height); } void DirectionalIntraPredictorZone3_NEON( void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, const void* LIBGAV1_RESTRICT const left_column, const int width, const int height, const int ystep, const bool upsampled_left) { const auto* const left = static_cast(left_column); auto* dst = static_cast(dest); if (ystep == 64) { assert(!upsampled_left); const int width_bytes = width * sizeof(left[0]); int y = height; do { const uint16_t* left_ptr = left + 1; memcpy(dst, left_ptr, width_bytes); memcpy(dst + stride, left_ptr + 1, width_bytes); memcpy(dst + 2 * stride, left_ptr + 2, width_bytes); memcpy(dst + 3 * stride, left_ptr + 3, width_bytes); dst += 4 * stride; left_ptr += 4; y -= 4; } while (y != 0); return; } if (height == 4) { if (upsampled_left) { DirectionalZone3_Wx4(dst, stride, width, left, ystep); } else { DirectionalZone3_Wx4(dst, stride, width, left, ystep); } } else if (width == 4) { if (upsampled_left) { DirectionalZone3_4xH(dst, stride, height, left, ystep); } else { DirectionalZone3_4xH(dst, stride, height, left, ystep); } } else { if (upsampled_left) { // |upsampled_left| can only be true if |width| + |height| <= 16, // therefore this is 8x8. DirectionalZone3_8x8(dst, stride, left, ystep); } else { DirectionalZone3_WxH(dst, stride, width, height, left, ystep); } } } // ----------------------------------------------------------------------------- // Zone2 // This function deals with cases not found in zone 1 or zone 3. The extreme // angles are 93, which makes for sharp ascents along |left_column| with each // successive dest row element until reaching |top_row|, and 177, with a shallow // ascent up |left_column| until reaching large jumps along |top_row|. In the // extremely steep cases, source vectors can only be loaded one lane at a time. // Fill |left| and |right| with the appropriate values for a given |base_step|. inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source, const uint8x8_t left_step, const uint8x8_t right_step, uint16x4_t* left, uint16x4_t* right) { const uint8x16x2_t mixed = { vld1q_u8(static_cast(source)), vld1q_u8(static_cast(source) + 16)}; *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step)); *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step)); } inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source, const uint8x8_t left_step_0, const uint8x8_t right_step_0, const uint8x8_t left_step_1, const uint8x8_t right_step_1, uint16x8_t* left, uint16x8_t* right) { const uint8x16x2_t mixed = { vld1q_u8(static_cast(source)), vld1q_u8(static_cast(source) + 16)}; const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0)); const uint16x4_t left_high = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1)); *left = vcombine_u16(left_low, left_high); const uint16x4_t right_low = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0)); const uint16x4_t right_high = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1)); *right = vcombine_u16(right_low, right_high); } // Blend two values based on weight pairs that each sum to 32. 
inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
                                const uint16x4_t a_weight,
                                const uint16x4_t b_weight) {
  const uint16x4_t a_product = vmul_u16(a, a_weight);
  const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
  return vrshr_n_u16(sum, 5 /*log2(32)*/);
}

// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative in localized functions.
// This is accommodated by making sure the relative indices are within [-15, 0]
// when the function is called, and sliding them into the inclusive range
// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
// the byte offset for table lookups.
constexpr int kPositiveIndexOffsetPixels = 15;
constexpr int kPositiveIndexOffsetBytes = 30;

inline void DirectionalZone2FromLeftCol_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x4_t offset_y;
  int16x4_t shift_upsampled = left_y;
  // The shift argument must be a constant, otherwise use upsample_shift
  // directly.
  if (upsampled) {
    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshr_n_s16(left_y, index_scale_bits);
  }
  offset_y = vshl_n_s16(offset_y, 1);

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the
  // right. With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl
  // supports 2 Q registers as input which would allow for cumulative
  // offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x4_t sampler_0 =
      vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
  const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
  const uint8x8_t left_indices =
      vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
  const uint8x8_t right_indices =
      vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));

  const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
  const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
  const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);

  int y = 0;
  do {
    uint16x4_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_indices, right_indices, &src_left, &src_right);
    const uint16x4_t val =
        WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store4(dst, val);
    dst += stride;
  } while (++y < height);
}

inline void DirectionalZone2FromLeftCol_8x8(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
    const bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);

  const int index_scale_bits = 6;
  // The values in |offset_y| are negative, except for the first element, which
  // is zero.
  int16x8_t offset_y;
  int16x8_t shift_upsampled = left_y;
  // The shift argument must be a constant, otherwise use upsample_shift
  // directly.
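  // (The "_n_" NEON shift intrinsics used here, vshrq_n_s16/vshlq_n_s16,
  // require a compile-time-constant shift amount, hence the branch on
  // |upsampled| rather than shifting by a runtime upsample_shift.)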
  if (upsampled) {
    offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
  } else {
    offset_y = vshrq_n_s16(left_y, index_scale_bits);
  }
  offset_y = vshlq_n_s16(offset_y, 1);

  // Select values to the left of the starting point.
  // The 15th element (and 16th) will be all the way at the end, to the right.
  // With a negative ystep everything else will be "left" of them.
  // This supports cumulative steps up to 15. We could support up to 16 by
  // doing separate loads for |left_values| and |right_values|. vtbl supports
  // 2 Q registers as input which would allow for cumulative offsets of 32.
  // |sampler_0| indexes the first byte of each 16-bit value.
  const int16x8_t sampler_0 =
      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
  // |sampler_1| indexes the second byte of each 16-bit value.
  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
  const uint8x8_t right_values_0 =
      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
  const uint8x8_t right_values_1 =
      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));

  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
  const uint16x8_t shift_0 =
      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);

  for (int y = 0; y < 8; ++y) {
    uint16x8_t src_left, src_right;
    LoadStepwise(
        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
        left_values_0, right_values_0, left_values_1, right_values_1,
        &src_left, &src_right);
    const uint16x8_t val =
        WeightedBlend(src_left, src_right, shift_1, shift_0);

    Store8(dst, val);
    dst += stride;
  }
}

template <bool upsampled>
inline void DirectionalZone1Blend_4xH(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Representing positions along the row, which |zone_bounds| will target for
  // the blending boundary.
  const int16x4_t indices = {0, 1, 2, 3};

  uint16x4x2_t top_vals;
  int y = height;
  do {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    const uint16x4_t val =
        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);

    const uint16x4_t dst_blend = Load4U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
    const uint16x4_t output = vbsl_u16(blend, val, dst_blend);

    Store4(dest, output);
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  } while (--y != 0);
}

template <bool upsampled>
inline void DirectionalZone1Blend_8x8(
    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
    const int xstep) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int scale_bits_x = 6 - upsample_shift;

  // Representing positions along the row, which |zone_bounds| will target for
  // the blending boundary.
  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};

  uint16x8x2_t top_vals;
  for (int y = 0; y < 8; ++y) {
    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
    LoadEdgeVals(&top_vals, src, upsampled);

    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
    const uint16_t shift_1 = 32 - shift_0;

    const uint16x8_t val =
        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);

    const uint16x8_t dst_blend = Load8U16(dest);
    // |zone_bounds| values can be negative.
    const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
    const uint16x8_t output = vbslq_u16(blend, val, dst_blend);

    Store8(dest, output);
    dest += stride;
    zone_bounds += xstep;
    top_x -= xstep;
  }
}

// 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
// then handle only blocks that take from |left_ptr|. Additionally, a fast
// index-shuffle approach is used for pred values from |left_column| in
// sections that permit it.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_4xH(
    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);

  // Helper vector for index computation.
  const int16x4_t zero_to_three = {0, 1, 2, 3};

  // Loop increments for moving by block (4xN). Vertical still steps by 8. If
  // it's only 4, it will be finished in the first iteration.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  const int min_height = (height == 4) ? 4 : 8;

  // All columns from |min_top_only_x| to the right will only need |top_row| to
  // compute and can therefore call the Zone1 functions. This assumes |xstep|
  // is at least 3.
  assert(xstep >= 3);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. The following values need the full ystep as a relative offset.
  const int16x4_t left_y =
      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);

  // This loop treats the 4 columns in 3 stages with y-value boundaries.
  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops,
  // covers blocks that have a mixture of values computed from top or left.
  // The final stage covers blocks that are only computed from the left.

  // Round down to the nearest multiple of 8 (or 4, if height is 4).
  const int max_top_only_y =
      std::min((1 << 6) / xstep, height) & ~(min_height - 1);
  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
                                      stride >> 1, max_top_only_y, top_row,
                                      -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute.
  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);

  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  // +8 increment is OK because if height is 4 this only runs once.
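  // Illustrative example (not exhaustive): with xstep = 32 and height = 16,
  // max_top_only_y = min(64 / 32, 16) & ~7 = 0 and
  // min_left_only_y = min(256 / 32, 16) = 8, so rows [0, 8) take the mixed
  // path below and rows [8, 16) fall through to the left-only Zone3 calls.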
  for (; y < min_left_only_y;
       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    DirectionalZone2FromLeftCol_4xH(
        dst, stride, min_height,
        left_column + ((y - left_base_increment) << upsample_left_shift),
        left_y, upsampled_left);

    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
                                             xstep_bounds, top_x, xstep);
  }

  // Left-only section. |height| - |y| is assumed equivalent to:
  // (y == 0) && (height == 4)
  if (height - y == 4) {
    DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
    return;
  }
  if (y < height) {
    DirectionalZone3_4xH<upsampled_left>(
        dst, stride, height - y, left_column + (y << upsample_left_shift),
        -ystep);
  }
}

// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
// address safety.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_Wx4(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int xstep, const int ystep) {
  const int upsample_top_shift = static_cast<int>(upsampled_top);
  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int min_top_only_x = std::min((4 * xstep) >> 6, width);
  int x = 0;
  for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
    uint8_t* dst_x = dst + x * sizeof(uint16_t);

    // Round down to the nearest multiple of 4.
    const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
    if (max_top_only_y != 0) {
      DirectionalZone1_4xH<upsampled_top>(
          reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
          top_row + (x << upsample_top_shift), -xstep);
      continue;
    }

    DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
                                         -ystep * x);

    const int min_left_only_y = ((x + 4) << 6) / xstep;
    if (min_left_only_y != 0) {
      const int top_x = -xstep;
      DirectionalZone1Blend_4xH<upsampled_top>(
          dst_x, stride, 4, top_row + (x << upsample_top_shift),
          xstep_bounds_base, top_x, xstep);
    }
  }
  // Reached |min_top_only_x|.
  for (; x < width; x += 4) {
    DirectionalZone1_4xH<upsampled_top>(
        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
        top_row + (x << upsample_top_shift), -xstep);
  }
}

template <bool upsampled_top, bool upsampled_left, bool shuffle_left_column>
inline void DirectionalZone2_8xH(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
    const int xstep, const int ystep, const int x, const int left_offset,
    const int xstep_bounds_base, const int16x8_t left_y) {
  const int upsample_left_shift = static_cast<int>(upsampled_left);
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Loop incrementers for moving by block (8x8). This function handles blocks
  // with height 4 as well. They are calculated in one pass so these variables
  // do not get used.
  const ptrdiff_t stride8 = stride << 3;
  const int xstep8 = xstep << 3;

  // The first stage, before the first y-loop, covers blocks that are only
  // computed from the top row. The second stage, comprising two y-loops,
  // covers blocks that have a mixture of values computed from top or left.
  // The final stage covers blocks that are only computed from the left.
  uint8_t* dst_x = dst + x * sizeof(uint16_t);

  // Round down to the nearest multiple of 8.
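  // Illustrative example: for x = 8, xstep = 32, height = 32, this yields
  // max_top_only_y = min((9 << 6) / 32, 32) & ~7 = 16, i.e. the first 16 rows
  // of this column of blocks come from |top_row| alone.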
  const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
  DirectionalZone1_WxH<upsampled_top>(
      reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
      top_row + (x << upsample_top_shift), -xstep);

  if (max_top_only_y == height) return;

  int y = max_top_only_y;
  dst_x += stride * y;
  const int xstep_y = xstep * y;

  // All rows from |min_left_only_y| down for this set of columns only need
  // |left_column| to compute. Round up to the nearest 8.
  const int min_left_only_y =
      Align(std::min(((x + 8) << 6) / xstep, height), 8);

  int xstep_bounds = xstep_bounds_base + xstep_y;
  int top_x = -xstep - xstep_y;

  for (; y < min_left_only_y;
       y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
    if (shuffle_left_column) {
      DirectionalZone2FromLeftCol_8x8(
          dst_x, stride,
          left_column + ((left_offset + y) << upsample_left_shift), left_y,
          upsampled_left);
    } else {
      DirectionalZone3_8x8<upsampled_left>(
          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
          -ystep * x);
    }

    DirectionalZone1Blend_8x8<upsampled_top>(
        dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds,
        top_x, xstep);
  }

  // Loop over y for left_only rows.
  for (; y < height; y += 8, dst_x += stride8) {
    DirectionalZone3_8x8<upsampled_left>(
        dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
        -ystep * x);
  }
}

// Process a multiple of 8 |width|.
template <bool upsampled_top, bool upsampled_left>
inline void DirectionalZone2_NEON(
    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
    const uint16_t* LIBGAV1_RESTRICT const top_row,
    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep) {
  if (height == 4) {
    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
        dst, stride, top_row, left_column, width, xstep, ystep);
    return;
  }
  const int upsample_top_shift = static_cast<int>(upsampled_top);

  // Helper vector.
  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};

  const int ystep8 = ystep << 3;

  // All columns from |min_top_only_x| to the right will only need |top_row|
  // to compute and can therefore call the Zone1 functions. This assumes
  // |xstep| is at least 3.
  assert(xstep >= 3);
  const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);

  // Analysis finds that, for most angles (ystep < 132), all segments that use
  // both top_row and left_column can compute from left_column using byte
  // shuffles from a single vector. For steeper angles, the shuffle is also
  // fully reliable when x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);

  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;

  const int left_base_increment = ystep >> 6;
  const int ystep_remainder = ystep & 0x3F;

  const int left_base_increment8 = ystep8 >> 6;
  const int ystep_remainder8 = ystep8 & 0x3F;
  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);

  // If the 64 scaling is regarded as a decimal point, the first value of the
  // left_y vector omits the portion which is covered under the left_column
  // offset. Following values need the full ystep as a relative offset.
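  // For example (illustrative values): ystep = 80 gives ystep_remainder = 16
  // and left_y = {-16, -96, -176, -256, -336, -416, -496, -576}; lane i holds
  // -(ystep & 0x3F) - i * ystep in 1/64ths of a pixel.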
  int16x8_t left_y =
      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);

  int x = 0;
  for (int left_offset = -left_base_increment; x < min_shuffle_x;
       x += 8, xstep_bounds_base -= (8 << 6),
       left_y = vsubq_s16(left_y, increment_left8),
       left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<upsampled_top, upsampled_left,
                         /*shuffle_left_column=*/false>(
        dst, stride, top_row, left_column, height, xstep, ystep, x,
        left_offset, xstep_bounds_base, left_y);
  }
  for (int left_offset = -left_base_increment; x < min_top_only_x;
       x += 8, xstep_bounds_base -= (8 << 6),
       left_y = vsubq_s16(left_y, increment_left8),
       left_offset -= left_base_increment8) {
    DirectionalZone2_8xH<upsampled_top, upsampled_left,
                         /*shuffle_left_column=*/true>(
        dst, stride, top_row, left_column, height, xstep, ystep, x,
        left_offset, xstep_bounds_base, left_y);
  }
  // Reached |min_top_only_x|.
  if (x < width) {
    DirectionalZone1_WxH<upsampled_top>(
        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
        top_row + (x << upsample_top_shift), -xstep);
  }
}

// At this angle, neither edge is upsampled.
// |min_width| is either 4 or 8.
template <int min_width>
void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
                         const uint16_t* LIBGAV1_RESTRICT const top,
                         const uint16_t* LIBGAV1_RESTRICT const left,
                         const int width, const int height) {
  // y = 0 is more trivial than the other rows.
  memcpy(dst, top - 1, width * sizeof(top[0]));
  dst += stride;

  // If |height| > |width|, then there is a point at which top_row is no
  // longer used in each row.
  const int min_left_only_y = std::min(width, height);

  int y = 1;
  do {
    // Example: If y is 4 (min_width), the dest row starts with left[3],
    // left[2], left[1], left[0], because the angle points up. Therefore, the
    // load starts at left[0] and is then reversed. If y is 2, the load starts
    // at left[-2], and is reversed to store left[1], left[0], with negative
    // values overwritten from |top_row|.
    const uint16_t* const load_left = left + y - min_width;
    auto* dst16 = reinterpret_cast<uint16_t*>(dst);

    // Some values will be overwritten when |y| is not a multiple of
    // |min_width|.
    if (min_width == 4) {
      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
      vst1_u16(dst16, left_toward_corner);
    } else {
      int x = 0;
      do {
        const uint16x8_t left_toward_corner =
            vrev64q_u16(vld1q_u16(load_left - x));
        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
        x += 8;
      } while (x < y);
    }
    // Entering |top|.
    memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
    dst += stride;
  } while (++y < min_left_only_y);

  // Left only.
  for (; y < height; ++y, dst += stride) {
    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
    const uint16_t* const load_left = left + y - min_width;

    int x = 0;
    if (min_width == 4) {
      const uint16x4_t left_toward_corner =
          vrev64_u16(vld1_u16(load_left - x));
      vst1_u16(dst16 + x, left_toward_corner);
    } else {
      do {
        const uint16x8_t left_toward_corner =
            vrev64q_u16(vld1q_u16(load_left - x));
        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
        x += 8;
      } while (x < width);
    }
  }
}

void DirectionalIntraPredictorZone2_NEON(
    void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column, const int width,
    const int height, const int xstep, const int ystep,
    const bool upsampled_top, const bool upsampled_left) {
  // Increasing the negative buffer for this function allows more rows to be
  // processed at a time without branching in an inner loop to check the base.
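  // As laid out below, each 288-entry buffer receives 80 edge pixels
  // (160 bytes) copied to offset 128, starting 16 pixels before the edge; the
  // working pointer then sits at offset 144, i.e. at the first edge pixel, so
  // negative indices resolve inside the local buffer.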
  uint16_t top_buffer[288];
  uint16_t left_buffer[288];
#if LIBGAV1_MSAN
  memset(top_buffer, 0, sizeof(top_buffer));
  memset(left_buffer, 0, sizeof(left_buffer));
#endif  // LIBGAV1_MSAN

  memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
  memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
         160);
  const uint16_t* top_ptr = top_buffer + 144;
  const uint16_t* left_ptr = left_buffer + 144;
  auto* dst = static_cast<uint8_t*>(dest);

  if (width == 4) {
    if (xstep == 64) {
      assert(ystep == 64);
      DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
      return;
    }
    if (upsampled_top) {
      if (upsampled_left) {
        DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr,
                                         height, xstep, ystep);
      } else {
        DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
                                          height, xstep, ystep);
      }
    } else if (upsampled_left) {
      DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
                                        xstep, ystep);
    } else {
      DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr,
                                         height, xstep, ystep);
    }
    return;
  }

  if (xstep == 64) {
    assert(ystep == 64);
    DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
    return;
  }
  if (upsampled_top) {
    if (upsampled_left) {
      DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
                                        height, xstep, ystep);
    } else {
      DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
                                         height, xstep, ystep);
    }
  } else if (upsampled_left) {
    DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
                                       height, xstep, ystep);
  } else {
    DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
                                        height, xstep, ystep);
  }
}

void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}

}  // namespace
}  // namespace high_bitdepth
#endif  // LIBGAV1_MAX_BITDEPTH >= 10

void IntraPredDirectionalInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_NEON

namespace libgav1 {
namespace dsp {

void IntraPredDirectionalInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON