about | summary | refs | log | tree | commit | diff
path: root/internal/output_sse.h
diff options
context:
space:
mode:
author: Miao Wang <miaowang@google.com> 2018-02-23 23:31:32 +0000
committer: android-build-merger <android-build-merger@google.com> 2018-02-23 23:31:32 +0000
commit: 0ed4f31d5ced2432473aa7063bc1e28d990ff3f2 (patch)
tree: a6ece8759b2fc774b39edea08417e08fa633a73c /internal/output_sse.h
parent: 97962621d25000e4eda770f4dd399a4378fd6b8b (diff)
parent: 1f4ec3258fe3b77841065990a20fe2047464688b (diff)
download: gemmlowp-oreo-mr1-1.2-iot-release.tar.gz
Rebase gemmlowp to ecae4d1 am: 7d0d5a611e am: 9fa88931b4 [refs: android-wear-8.0.0_r2, android-o-mr1-iot-release-smart-display-r9, android-o-mr1-iot-release-smart-display-r8, android-o-mr1-iot-release-smart-display-r5, android-o-mr1-iot-release-smart-display-r40.1J, android-o-mr1-iot-release-smart-display-r4, android-o-mr1-iot-release-smart-display-r39, android-o-mr1-iot-release-smart-display-r30, android-o-mr1-iot-release-smart-display-r3, android-o-mr1-iot-release-smart-display-r22, android-o-mr1-iot-release-smart-display-r14, android-o-mr1-iot-release-smart-clock-r6, android-o-mr1-iot-release-smart-clock-r2, android-o-mr1-iot-release-smart-clock-fsi, android-o-mr1-iot-release-smart-clock-fcs, android-o-mr1-iot-release-cube_r2, android-o-mr1-iot-release-cube-fsi, android-o-mr1-iot-release-cube-fcs, android-o-mr1-iot-release-1.0.5, android-o-mr1-iot-release-1.0.4, android-o-mr1-iot-release-1.0.3, android-n-iot-release-ihome-igv1, android-9.0.0_r47, android-9.0.0_r46, android-9.0.0_r45, android-9.0.0_r44, android-9.0.0_r43, android-9.0.0_r42, android-9.0.0_r41, android-9.0.0_r40, android-9.0.0_r39, android-9.0.0_r38, android-9.0.0_r37, android-9.0.0_r36, android-9.0.0_r35, android-9.0.0_r34, android-9.0.0_r33, android-9.0.0_r32, android-9.0.0_r31, android-9.0.0_r30, android-9.0.0_r22, android-9.0.0_r21, android-9.0.0_r20, android-9.0.0_r19, android-9.0.0_r16, android-9.0.0_r12, android-9.0.0_r11, pie-qpr3-s1-release, pie-qpr3-release, pie-qpr3-b-release, pie-qpr2-release, pie-qpr1-s3-release, pie-qpr1-s2-release, pie-qpr1-s1-release, pie-qpr1-release, pie-dr1-release, pie-dr1-dev, pie-dev, pie-b4s4-release, pie-b4s4-dev, oreo-mr1-1.2-iot-release, nougat-iot-release, master-cuttlefish-testing-release]
am: 1f4ec3258f Change-Id: Icb9df1558e7d87c03080597ffbb5a6212817cba6
Diffstat (limited to 'internal/output_sse.h')
-rw-r--r-- internal/output_sse.h 186
1 files changed, 186 insertions, 0 deletions
diff --git a/internal/output_sse.h b/internal/output_sse.h
index 5c06253..75aebfd 100644
--- a/internal/output_sse.h
+++ b/internal/output_sse.h
@@ -103,6 +103,82 @@ struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
}
};
+template <>  // SSE specialization: saturating int32 -> int16 cast of a 4-value buffer.
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+ RegBufferInt32<4>> {
+ typedef RegBufferInt32<4> InputType;
+ typedef RegBufferInt16<4> OutputType;
+
+ typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+ OutputStageEvalBufferImpl(const OutputStage&) {}  // The stage argument is ignored; no state is kept.
+
+ OutputType Eval(InputType input) const {  // Returns the 4 input values narrowed to int16 with signed saturation.
+ OutputType output;
+ __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]);  // Same register passed twice; only the low 4 lanes of the packed result are read below.
+ output.reg[0] = _mm_extract_epi16(res_16, 0);  // Lanes 0..3 are copied out one at a time into output.reg[0..3].
+ output.reg[1] = _mm_extract_epi16(res_16, 1);
+ output.reg[2] = _mm_extract_epi16(res_16, 2);
+ output.reg[3] = _mm_extract_epi16(res_16, 3);
+ return output;
+ }
+};
+
+template <>  // SSE specialization: saturating int32 -> int16 cast of an 8-value buffer.
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+ RegBufferInt32<8>> {
+ typedef RegBufferInt32<8> InputType;
+ typedef RegBufferInt16<8> OutputType;
+
+ typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+ OutputStageEvalBufferImpl(const OutputStage&) {}  // The stage argument is ignored; no state is kept.
+
+ OutputType Eval(InputType input) const {  // Returns the input narrowed to int16 with signed saturation.
+ OutputType output;
+ output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);  // Low 4 lanes come from reg[0], high 4 lanes from reg[1].
+ return output;
+ }
+};
+
+template <>  // SSE specialization: saturating int32 -> int16 cast of a 16-value buffer.
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+ RegBufferInt32<16>> {
+ typedef RegBufferInt32<16> InputType;
+ typedef RegBufferInt16<16> OutputType;
+
+ typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+ OutputStageEvalBufferImpl(const OutputStage&) {}  // The stage argument is ignored; no state is kept.
+
+ OutputType Eval(InputType input) const {  // Returns the input narrowed to int16 with signed saturation.
+ OutputType output;
+ output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);  // Each output register packs two consecutive input registers, in order.
+ output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+ return output;
+ }
+};
+
+template <>  // SSE specialization: saturating int32 -> int16 cast of a 32-value buffer.
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+ RegBufferInt32<32>> {
+ typedef RegBufferInt32<32> InputType;
+ typedef RegBufferInt16<32> OutputType;
+
+ typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+ OutputStageEvalBufferImpl(const OutputStage&) {}  // The stage argument is ignored; no state is kept.
+
+ OutputType Eval(InputType input) const {  // Returns the input narrowed to int16 with signed saturation.
+ OutputType output;
+ output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);  // Output register k packs input registers 2k (low lanes) and 2k+1 (high lanes).
+ output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+ output.reg[2] = _mm_packs_epi32(input.reg[4], input.reg[5]);
+ output.reg[3] = _mm_packs_epi32(input.reg[6], input.reg[7]);
+ return output;
+ }
+};
+
template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> {
static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row,
@@ -138,6 +214,36 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> {
}
};
+template <typename DstType>  // Stores a 4x1 int16 register block into dst at (row, col).
+struct StoreFinalOutputImpl<RegBlockInt16<4, 1>, DstType> {
+ static void Run(const RegBlockInt16<4, 1>& src, DstType* dst, int row,
+ int col) {
+ *dst->data(row + 0, col) = src.buf.reg[0];  // reg[k] is written to row + k of the same column, one element per store.
+ *dst->data(row + 1, col) = src.buf.reg[1];
+ *dst->data(row + 2, col) = src.buf.reg[2];
+ *dst->data(row + 3, col) = src.buf.reg[3];
+ }
+};
+
+template <typename DstType>  // Stores an 8x1 int16 register block into dst at (row, col).
+struct StoreFinalOutputImpl<RegBlockInt16<8, 1>, DstType> {
+ static void Run(const RegBlockInt16<8, 1>& src, DstType* dst, int row,
+ int col) {
+ if (DstType::kOrder == MapOrder::ColMajor) {  // Column-major destination: the whole column goes out in one vector store.
+ StoreInt16x8(dst->data(row, col), src.buf.reg[0]);
+ } else {  // Otherwise each 16-bit lane k is extracted and written individually to row + k.
+ *dst->data(row + 0, col) = _mm_extract_epi16(src.buf.reg[0], 0);
+ *dst->data(row + 1, col) = _mm_extract_epi16(src.buf.reg[0], 1);
+ *dst->data(row + 2, col) = _mm_extract_epi16(src.buf.reg[0], 2);
+ *dst->data(row + 3, col) = _mm_extract_epi16(src.buf.reg[0], 3);
+ *dst->data(row + 4, col) = _mm_extract_epi16(src.buf.reg[0], 4);
+ *dst->data(row + 5, col) = _mm_extract_epi16(src.buf.reg[0], 5);
+ *dst->data(row + 6, col) = _mm_extract_epi16(src.buf.reg[0], 6);
+ *dst->data(row + 7, col) = _mm_extract_epi16(src.buf.reg[0], 7);
+ }
+ }
+};
+
inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) {
__m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]);
__m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]);
@@ -170,6 +276,21 @@ struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> {
};
template <typename DstType>  // Stores a 4x4 int16 register block into dst with its top-left corner at (row, col).
+struct StoreFinalOutputImpl<RegBlockInt16<4, 4>, DstType> {
+ static void Run(const RegBlockInt16<4, 4>& src, DstType* dst, int row,
+ int col) {
+ std::int16_t buf[16];  // Scratch copy of both registers so elements can be addressed individually.
+ StoreInt16x8(buf + 0, src.buf.reg[0]);
+ StoreInt16x8(buf + 8, src.buf.reg[1]);
+ for (int i = 0; i < 4; i++) {  // i indexes the row, j the column of the block.
+ for (int j = 0; j < 4; j++) {
+ *dst->data(row + i, col + j) = buf[i + 4 * j];  // Element (i, j) sits at offset i + 4 * j, i.e. the scratch buffer is read column by column.
+ }
+ }
+ }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row,
int col) {
@@ -202,6 +323,29 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
};
template <typename DstType>  // Stores an 8x4 int16 register block into dst with its top-left corner at (row, col).
+struct StoreFinalOutputImpl<RegBlockInt16<8, 4>, DstType> {
+ static void Run(const RegBlockInt16<8, 4>& src, DstType* dst, int row,
+ int col) {
+ if (DstType::kOrder == MapOrder::ColMajor) {  // Column-major destination: register i is stored directly as column col + i.
+ for (int i = 0; i < 4; i++) {
+ StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+ }
+ } else {  // Otherwise spill all four registers to a scratch array and write element by element.
+ std::int16_t buf[32];
+ StoreInt16x8(buf + 0, src.buf.reg[0]);
+ StoreInt16x8(buf + 8, src.buf.reg[1]);
+ StoreInt16x8(buf + 16, src.buf.reg[2]);
+ StoreInt16x8(buf + 24, src.buf.reg[3]);
+ for (int i = 0; i < 8; i++) {  // i indexes the row, j the column of the block.
+ for (int j = 0; j < 4; j++) {
+ *dst->data(row + i, col + j) = buf[i + 8 * j];  // Element (i, j) sits at offset i + 8 * j: register j supplies column j.
+ }
+ }
+ }
+ }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row,
int col) {
@@ -255,6 +399,48 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
};
template <typename DstType>  // Stores an 8x8 int16 register block into dst with its top-left corner at (row, col).
+struct StoreFinalOutputImpl<RegBlockInt16<8, 8>, DstType> {
+ static void Run(const RegBlockInt16<8, 8>& src, DstType* dst, int row,
+ int col) {
+ if (DstType::kOrder == MapOrder::ColMajor) {  // Column-major destination: register i is stored directly as column col + i.
+ for (int i = 0; i < 8; i++) {
+ StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+ }
+ } else {  // Otherwise transpose in registers (four 4x4 quadrants) so each store writes one full row.
+ // top-left 4x4
+ __m128i t0 = _mm_unpacklo_epi16(src.buf.reg[0], src.buf.reg[1]);  // Rows 0-3, columns 0 and 1 interleaved.
+ __m128i t1 = _mm_unpacklo_epi16(src.buf.reg[2], src.buf.reg[3]);  // Rows 0-3, columns 2 and 3 interleaved.
+ __m128i u0 = _mm_unpacklo_epi32(t0, t1);  // Row 0 cols 0-3, then row 1 cols 0-3.
+ __m128i u1 = _mm_unpackhi_epi32(t0, t1);  // Row 2 cols 0-3, then row 3 cols 0-3.
+ // top-right 4x4
+ __m128i t2 = _mm_unpacklo_epi16(src.buf.reg[4], src.buf.reg[5]);
+ __m128i t3 = _mm_unpacklo_epi16(src.buf.reg[6], src.buf.reg[7]);
+ __m128i u2 = _mm_unpacklo_epi32(t2, t3);  // Row 0 cols 4-7, then row 1 cols 4-7.
+ __m128i u3 = _mm_unpackhi_epi32(t2, t3);  // Row 2 cols 4-7, then row 3 cols 4-7.
+ // bottom-left 4x4
+ __m128i t4 = _mm_unpackhi_epi16(src.buf.reg[0], src.buf.reg[1]);  // Same pattern using the high halves, i.e. rows 4-7.
+ __m128i t5 = _mm_unpackhi_epi16(src.buf.reg[2], src.buf.reg[3]);
+ __m128i u4 = _mm_unpacklo_epi32(t4, t5);  // Row 4 cols 0-3, then row 5 cols 0-3.
+ __m128i u5 = _mm_unpackhi_epi32(t4, t5);  // Row 6 cols 0-3, then row 7 cols 0-3.
+ // bottom-right 4x4
+ __m128i t6 = _mm_unpackhi_epi16(src.buf.reg[4], src.buf.reg[5]);
+ __m128i t7 = _mm_unpackhi_epi16(src.buf.reg[6], src.buf.reg[7]);
+ __m128i u6 = _mm_unpacklo_epi32(t6, t7);  // Row 4 cols 4-7, then row 5 cols 4-7.
+ __m128i u7 = _mm_unpackhi_epi32(t6, t7);  // Row 6 cols 4-7, then row 7 cols 4-7.
+
+ StoreInt16x8(dst->data(row + 0, col), _mm_unpacklo_epi64(u0, u2));  // Each store joins the left (cols 0-3) and right (cols 4-7) halves of one row.
+ StoreInt16x8(dst->data(row + 1, col), _mm_unpackhi_epi64(u0, u2));
+ StoreInt16x8(dst->data(row + 2, col), _mm_unpacklo_epi64(u1, u3));
+ StoreInt16x8(dst->data(row + 3, col), _mm_unpackhi_epi64(u1, u3));
+ StoreInt16x8(dst->data(row + 4, col), _mm_unpacklo_epi64(u4, u6));
+ StoreInt16x8(dst->data(row + 5, col), _mm_unpackhi_epi64(u4, u6));
+ StoreInt16x8(dst->data(row + 6, col), _mm_unpacklo_epi64(u5, u7));
+ StoreInt16x8(dst->data(row + 7, col), _mm_unpackhi_epi64(u5, u7));
+ }
+ }
+};
+
+template <typename DstType>
struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> {
static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row,
int col) {