about | summary | refs | log | tree | commit | diff
path: root/internal/output_sse.h
diff options
context:
space:
mode:
Diffstat (limited to 'internal/output_sse.h')
-rw-r--r-- internal/output_sse.h 186
1 files changed, 186 insertions, 0 deletions
diff --git a/internal/output_sse.h b/internal/output_sse.h
index 5c06253..75aebfd 100644
--- a/internal/output_sse.h
+++ b/internal/output_sse.h
@@ -103,6 +103,82 @@ struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
}
};
+// SSE evaluation of the saturating-cast-to-int16 output stage for a buffer
+// whose four int32 values live in a single 128-bit register.
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<4>> {
+  using InputType = RegBufferInt32<4>;
+  using OutputType = RegBufferInt16<4>;
+  using OutputStage = OutputStageSaturatingCastToInt16;
+
+  // The stage object is accepted for interface uniformity; nothing is read
+  // from it here.
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  // Narrows the four int32 lanes of input.reg[0] to int16 with signed
+  // saturation and returns them as output.reg[0..3].
+  OutputType Eval(InputType input) const {
+    OutputType narrowed;
+    // The register is passed as both operands: only the low four packed
+    // lanes are read back below, and those come from the first operand.
+    const __m128i packed = _mm_packs_epi32(input.reg[0], input.reg[0]);
+    narrowed.reg[0] = _mm_extract_epi16(packed, 0);
+    narrowed.reg[1] = _mm_extract_epi16(packed, 1);
+    narrowed.reg[2] = _mm_extract_epi16(packed, 2);
+    narrowed.reg[3] = _mm_extract_epi16(packed, 3);
+    return narrowed;
+  }
+};
+
+// SSE evaluation of the saturating-cast-to-int16 output stage for a buffer
+// of eight int32 values held in two 128-bit registers.
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<8>> {
+  using InputType = RegBufferInt32<8>;
+  using OutputType = RegBufferInt16<8>;
+  using OutputStage = OutputStageSaturatingCastToInt16;
+
+  // The stage object is accepted for interface uniformity; nothing is read
+  // from it here.
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  // Packs both input registers into one register of eight int16 values,
+  // using signed saturation. input.reg[0] supplies the low four lanes and
+  // input.reg[1] the high four.
+  OutputType Eval(InputType input) const {
+    OutputType narrowed;
+    narrowed.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    return narrowed;
+  }
+};
+
+// SSE evaluation of the saturating-cast-to-int16 output stage for a buffer
+// of sixteen int32 values held in four 128-bit registers.
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<16>> {
+  using InputType = RegBufferInt32<16>;
+  using OutputType = RegBufferInt16<16>;
+  using OutputStage = OutputStageSaturatingCastToInt16;
+
+  // The stage object is accepted for interface uniformity; nothing is read
+  // from it here.
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  // Packs each consecutive pair of int32 registers into one int16 register
+  // with signed saturation: output.reg[k] <- (input.reg[2k], input.reg[2k+1]).
+  OutputType Eval(InputType input) const {
+    OutputType narrowed;
+    for (int k = 0; k < 2; ++k) {
+      narrowed.reg[k] =
+          _mm_packs_epi32(input.reg[2 * k], input.reg[2 * k + 1]);
+    }
+    return narrowed;
+  }
+};
+
+// SSE evaluation of the saturating-cast-to-int16 output stage for a buffer
+// of thirty-two int32 values held in eight 128-bit registers.
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<32>> {
+  using InputType = RegBufferInt32<32>;
+  using OutputType = RegBufferInt16<32>;
+  using OutputStage = OutputStageSaturatingCastToInt16;
+
+  // The stage object is accepted for interface uniformity; nothing is read
+  // from it here.
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  // Packs each consecutive pair of int32 registers into one int16 register
+  // with signed saturation: output.reg[k] <- (input.reg[2k], input.reg[2k+1]).
+  OutputType Eval(InputType input) const {
+    OutputType narrowed;
+    for (int k = 0; k < 4; ++k) {
+      narrowed.reg[k] =
+          _mm_packs_epi32(input.reg[2 * k], input.reg[2 * k + 1]);
+    }
+    return narrowed;
+  }
+};
+
template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> {
static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row,
@@ -138,6 +214,36 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> {
}
};
+// Stores a 4x1 int16 block: element k of the source buffer is written to
+// position (row + k, col) of the destination.
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 1>, DstType> {
+  static void Run(const RegBlockInt16<4, 1>& src, DstType* dst, int row,
+                  int col) {
+    for (int r = 0; r < 4; ++r) {
+      *dst->data(row + r, col) = src.buf.reg[r];
+    }
+  }
+};
+
+// Stores an 8x1 int16 block held in one 128-bit register. A column-major
+// destination takes the whole register in one store starting at (row, col);
+// for any other order each lane k is extracted and written to (row + k, col).
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 1>, DstType> {
+  static void Run(const RegBlockInt16<8, 1>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      StoreInt16x8(dst->data(row, col), src.buf.reg[0]);
+      return;
+    }
+
+    // The lane index of _mm_extract_epi16 has to be a compile-time
+    // constant, so the eight writes stay spelled out instead of looping.
+    const __m128i lanes = src.buf.reg[0];
+    *dst->data(row + 0, col) = _mm_extract_epi16(lanes, 0);
+    *dst->data(row + 1, col) = _mm_extract_epi16(lanes, 1);
+    *dst->data(row + 2, col) = _mm_extract_epi16(lanes, 2);
+    *dst->data(row + 3, col) = _mm_extract_epi16(lanes, 3);
+    *dst->data(row + 4, col) = _mm_extract_epi16(lanes, 4);
+    *dst->data(row + 5, col) = _mm_extract_epi16(lanes, 5);
+    *dst->data(row + 6, col) = _mm_extract_epi16(lanes, 6);
+    *dst->data(row + 7, col) = _mm_extract_epi16(lanes, 7);
+  }
+};
+
inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) {
__m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]);
__m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]);
@@ -170,6 +276,21 @@ struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> {
};
+// Stores a 4x4 int16 block held in two 128-bit registers. The registers are
+// spilled to a scratch array, from which element (r, c) of the block is read
+// at index r + 4 * c and written to (row + r, col + c) of the destination.
template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 4>, DstType> {
+  static void Run(const RegBlockInt16<4, 4>& src, DstType* dst, int row,
+                  int col) {
+    std::int16_t scratch[16];
+    for (int k = 0; k < 2; ++k) {
+      StoreInt16x8(scratch + 8 * k, src.buf.reg[k]);
+    }
+    for (int r = 0; r < 4; ++r) {
+      for (int c = 0; c < 4; ++c) {
+        *dst->data(row + r, col + c) = scratch[r + 4 * c];
+      }
+    }
+  }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row,
int col) {
@@ -202,6 +323,29 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
};
+// Stores an 8x4 int16 block held in four 128-bit registers, register c
+// carrying column c. A column-major destination takes each register in one
+// store. For any other order the registers are spilled to a scratch array
+// and element (r, c), found at index r + 8 * c, is written individually.
template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 4>, DstType> {
+  static void Run(const RegBlockInt16<8, 4>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int c = 0; c < 4; ++c) {
+        StoreInt16x8(dst->data(row, col + c), src.buf.reg[c]);
+      }
+      return;
+    }
+
+    std::int16_t scratch[32];
+    for (int k = 0; k < 4; ++k) {
+      StoreInt16x8(scratch + 8 * k, src.buf.reg[k]);
+    }
+    for (int r = 0; r < 8; ++r) {
+      for (int c = 0; c < 4; ++c) {
+        *dst->data(row + r, col + c) = scratch[r + 8 * c];
+      }
+    }
+  }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row,
int col) {
@@ -255,6 +399,48 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
};
+// Stores an 8x8 int16 block held in eight 128-bit registers, register c
+// carrying column c (lane k = row k). A column-major destination takes each
+// register in one store. For any other order the block is transposed in
+// registers so that each store writes one complete row of eight values.
template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 8>, DstType> {
+  static void Run(const RegBlockInt16<8, 8>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int c = 0; c < 8; ++c) {
+        StoreInt16x8(dst->data(row, col + c), src.buf.reg[c]);
+      }
+      return;
+    }
+
+    // Step 1: interleave the 16-bit lanes of adjacent column pairs.
+    // "lo" results cover rows 0-3 of the pair, "hi" results rows 4-7.
+    const __m128i lo01 = _mm_unpacklo_epi16(src.buf.reg[0], src.buf.reg[1]);
+    const __m128i lo23 = _mm_unpacklo_epi16(src.buf.reg[2], src.buf.reg[3]);
+    const __m128i lo45 = _mm_unpacklo_epi16(src.buf.reg[4], src.buf.reg[5]);
+    const __m128i lo67 = _mm_unpacklo_epi16(src.buf.reg[6], src.buf.reg[7]);
+    const __m128i hi01 = _mm_unpackhi_epi16(src.buf.reg[0], src.buf.reg[1]);
+    const __m128i hi23 = _mm_unpackhi_epi16(src.buf.reg[2], src.buf.reg[3]);
+    const __m128i hi45 = _mm_unpackhi_epi16(src.buf.reg[4], src.buf.reg[5]);
+    const __m128i hi67 = _mm_unpackhi_epi16(src.buf.reg[6], src.buf.reg[7]);
+
+    // Step 2: interleave 32-bit pairs. Each result holds two half-rows
+    // (four values each) of one 4x4 quadrant of the block.
+    // Top-left quadrant (rows 0-3, columns 0-3).
+    const __m128i rows01_left = _mm_unpacklo_epi32(lo01, lo23);
+    const __m128i rows23_left = _mm_unpackhi_epi32(lo01, lo23);
+    // Top-right quadrant (rows 0-3, columns 4-7).
+    const __m128i rows01_right = _mm_unpacklo_epi32(lo45, lo67);
+    const __m128i rows23_right = _mm_unpackhi_epi32(lo45, lo67);
+    // Bottom-left quadrant (rows 4-7, columns 0-3).
+    const __m128i rows45_left = _mm_unpacklo_epi32(hi01, hi23);
+    const __m128i rows67_left = _mm_unpackhi_epi32(hi01, hi23);
+    // Bottom-right quadrant (rows 4-7, columns 4-7).
+    const __m128i rows45_right = _mm_unpacklo_epi32(hi45, hi67);
+    const __m128i rows67_right = _mm_unpackhi_epi32(hi45, hi67);
+
+    // Step 3: join the left and right halves of each row and store it.
+    StoreInt16x8(dst->data(row + 0, col),
+                 _mm_unpacklo_epi64(rows01_left, rows01_right));
+    StoreInt16x8(dst->data(row + 1, col),
+                 _mm_unpackhi_epi64(rows01_left, rows01_right));
+    StoreInt16x8(dst->data(row + 2, col),
+                 _mm_unpacklo_epi64(rows23_left, rows23_right));
+    StoreInt16x8(dst->data(row + 3, col),
+                 _mm_unpackhi_epi64(rows23_left, rows23_right));
+    StoreInt16x8(dst->data(row + 4, col),
+                 _mm_unpacklo_epi64(rows45_left, rows45_right));
+    StoreInt16x8(dst->data(row + 5, col),
+                 _mm_unpackhi_epi64(rows45_left, rows45_right));
+    StoreInt16x8(dst->data(row + 6, col),
+                 _mm_unpacklo_epi64(rows67_left, rows67_right));
+    StoreInt16x8(dst->data(row + 7, col),
+                 _mm_unpackhi_epi64(rows67_left, rows67_right));
+  }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> {
static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row,
int col) {