diff options
Diffstat (limited to 'internal/output_sse.h')
-rw-r--r-- | internal/output_sse.h | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/internal/output_sse.h b/internal/output_sse.h
index 5c06253..75aebfd 100644
--- a/internal/output_sse.h
+++ b/internal/output_sse.h
@@ -103,6 +103,82 @@ struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
   }
 };
 
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<4>> {
+  typedef RegBufferInt32<4> InputType;
+  typedef RegBufferInt16<4> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]);
+    output.reg[0] = _mm_extract_epi16(res_16, 0);
+    output.reg[1] = _mm_extract_epi16(res_16, 1);
+    output.reg[2] = _mm_extract_epi16(res_16, 2);
+    output.reg[3] = _mm_extract_epi16(res_16, 3);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<8>> {
+  typedef RegBufferInt32<8> InputType;
+  typedef RegBufferInt16<8> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<16>> {
+  typedef RegBufferInt32<16> InputType;
+  typedef RegBufferInt16<16> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<32>> {
+  typedef RegBufferInt32<32> InputType;
+  typedef RegBufferInt16<32> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+    output.reg[2] = _mm_packs_epi32(input.reg[4], input.reg[5]);
+    output.reg[3] = _mm_packs_epi32(input.reg[6], input.reg[7]);
+    return output;
+  }
+};
+
 template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> {
   static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row,
@@ -138,6 +214,36 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> {
   }
 };
 
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 1>, DstType> {
+  static void Run(const RegBlockInt16<4, 1>& src, DstType* dst, int row,
+                  int col) {
+    *dst->data(row + 0, col) = src.buf.reg[0];
+    *dst->data(row + 1, col) = src.buf.reg[1];
+    *dst->data(row + 2, col) = src.buf.reg[2];
+    *dst->data(row + 3, col) = src.buf.reg[3];
+  }
+};
+
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 1>, DstType> {
+  static void Run(const RegBlockInt16<8, 1>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      StoreInt16x8(dst->data(row, col), src.buf.reg[0]);
+    } else {
+      *dst->data(row + 0, col) = _mm_extract_epi16(src.buf.reg[0], 0);
+      *dst->data(row + 1, col) = _mm_extract_epi16(src.buf.reg[0], 1);
+      *dst->data(row + 2, col) = _mm_extract_epi16(src.buf.reg[0], 2);
+      *dst->data(row + 3, col) = _mm_extract_epi16(src.buf.reg[0], 3);
+      *dst->data(row + 4, col) = _mm_extract_epi16(src.buf.reg[0], 4);
+      *dst->data(row + 5, col) = _mm_extract_epi16(src.buf.reg[0], 5);
+      *dst->data(row + 6, col) = _mm_extract_epi16(src.buf.reg[0], 6);
+      *dst->data(row + 7, col) = _mm_extract_epi16(src.buf.reg[0], 7);
+    }
+  }
+};
+
 inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) {
   __m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]);
   __m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]);
@@ -170,6 +276,21 @@ struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 4>, DstType> {
+  static void Run(const RegBlockInt16<4, 4>& src, DstType* dst, int row,
+                  int col) {
+    std::int16_t buf[16];
+    StoreInt16x8(buf + 0, src.buf.reg[0]);
+    StoreInt16x8(buf + 8, src.buf.reg[1]);
+    for (int i = 0; i < 4; i++) {
+      for (int j = 0; j < 4; j++) {
+        *dst->data(row + i, col + j) = buf[i + 4 * j];
+      }
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
   static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row,
                   int col) {
@@ -202,6 +323,29 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 4>, DstType> {
+  static void Run(const RegBlockInt16<8, 4>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int i = 0; i < 4; i++) {
+        StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+      }
+    } else {
+      std::int16_t buf[32];
+      StoreInt16x8(buf + 0, src.buf.reg[0]);
+      StoreInt16x8(buf + 8, src.buf.reg[1]);
+      StoreInt16x8(buf + 16, src.buf.reg[2]);
+      StoreInt16x8(buf + 24, src.buf.reg[3]);
+      for (int i = 0; i < 8; i++) {
+        for (int j = 0; j < 4; j++) {
+          *dst->data(row + i, col + j) = buf[i + 8 * j];
+        }
+      }
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
   static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row,
                   int col) {
@@ -255,6 +399,48 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 8>, DstType> {
+  static void Run(const RegBlockInt16<8, 8>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int i = 0; i < 8; i++) {
+        StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+      }
+    } else {
+      // top-left 4x4
+      __m128i t0 = _mm_unpacklo_epi16(src.buf.reg[0], src.buf.reg[1]);
+      __m128i t1 = _mm_unpacklo_epi16(src.buf.reg[2], src.buf.reg[3]);
+      __m128i u0 = _mm_unpacklo_epi32(t0, t1);
+      __m128i u1 = _mm_unpackhi_epi32(t0, t1);
+      // top-right 4x4
+      __m128i t2 = _mm_unpacklo_epi16(src.buf.reg[4], src.buf.reg[5]);
+      __m128i t3 = _mm_unpacklo_epi16(src.buf.reg[6], src.buf.reg[7]);
+      __m128i u2 = _mm_unpacklo_epi32(t2, t3);
+      __m128i u3 = _mm_unpackhi_epi32(t2, t3);
+      // bottom-left 4x4
+      __m128i t4 = _mm_unpackhi_epi16(src.buf.reg[0], src.buf.reg[1]);
+      __m128i t5 = _mm_unpackhi_epi16(src.buf.reg[2], src.buf.reg[3]);
+      __m128i u4 = _mm_unpacklo_epi32(t4, t5);
+      __m128i u5 = _mm_unpackhi_epi32(t4, t5);
+      // bottom-right 4x4
+      __m128i t6 = _mm_unpackhi_epi16(src.buf.reg[4], src.buf.reg[5]);
+      __m128i t7 = _mm_unpackhi_epi16(src.buf.reg[6], src.buf.reg[7]);
+      __m128i u6 = _mm_unpacklo_epi32(t6, t7);
+      __m128i u7 = _mm_unpackhi_epi32(t6, t7);
+
+      StoreInt16x8(dst->data(row + 0, col), _mm_unpacklo_epi64(u0, u2));
+      StoreInt16x8(dst->data(row + 1, col), _mm_unpackhi_epi64(u0, u2));
+      StoreInt16x8(dst->data(row + 2, col), _mm_unpacklo_epi64(u1, u3));
+      StoreInt16x8(dst->data(row + 3, col), _mm_unpackhi_epi64(u1, u3));
+      StoreInt16x8(dst->data(row + 4, col), _mm_unpacklo_epi64(u4, u6));
+      StoreInt16x8(dst->data(row + 5, col), _mm_unpackhi_epi64(u4, u6));
+      StoreInt16x8(dst->data(row + 6, col), _mm_unpacklo_epi64(u5, u7));
+      StoreInt16x8(dst->data(row + 7, col), _mm_unpackhi_epi64(u5, u7));
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> {
   static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row,
                   int col) {