diff options
author | Miao Wang <miaowang@google.com> | 2018-02-23 23:31:32 +0000 |
---|---|---|
committer | android-build-merger <android-build-merger@google.com> | 2018-02-23 23:31:32 +0000 |
commit | 0ed4f31d5ced2432473aa7063bc1e28d990ff3f2 (patch) | |
tree | a6ece8759b2fc774b39edea08417e08fa633a73c /internal/output_sse.h | |
parent | 97962621d25000e4eda770f4dd399a4378fd6b8b (diff) | |
parent | 1f4ec3258fe3b77841065990a20fe2047464688b (diff) | |
download | gemmlowp-oreo-mr1-1.2-iot-release.tar.gz |
Rebase gemmlowp to ecae4d1 am: 7d0d5a611e am: 9fa88931b4
android-wear-8.0.0_r2 android-o-mr1-iot-release-smart-display-r9 android-o-mr1-iot-release-smart-display-r8 android-o-mr1-iot-release-smart-display-r5 android-o-mr1-iot-release-smart-display-r40.1J android-o-mr1-iot-release-smart-display-r4 android-o-mr1-iot-release-smart-display-r39 android-o-mr1-iot-release-smart-display-r30 android-o-mr1-iot-release-smart-display-r3 android-o-mr1-iot-release-smart-display-r22 android-o-mr1-iot-release-smart-display-r14 android-o-mr1-iot-release-smart-clock-r6 android-o-mr1-iot-release-smart-clock-r2 android-o-mr1-iot-release-smart-clock-fsi android-o-mr1-iot-release-smart-clock-fcs android-o-mr1-iot-release-cube_r2 android-o-mr1-iot-release-cube-fsi android-o-mr1-iot-release-cube-fcs android-o-mr1-iot-release-1.0.5 android-o-mr1-iot-release-1.0.4 android-o-mr1-iot-release-1.0.3 android-n-iot-release-ihome-igv1 android-9.0.0_r47 android-9.0.0_r46 android-9.0.0_r45 android-9.0.0_r44 android-9.0.0_r43 android-9.0.0_r42 android-9.0.0_r41 android-9.0.0_r40 android-9.0.0_r39 android-9.0.0_r38 android-9.0.0_r37 android-9.0.0_r36 android-9.0.0_r35 android-9.0.0_r34 android-9.0.0_r33 android-9.0.0_r32 android-9.0.0_r31 android-9.0.0_r30 android-9.0.0_r22 android-9.0.0_r21 android-9.0.0_r20 android-9.0.0_r19 android-9.0.0_r16 android-9.0.0_r12 android-9.0.0_r11 pie-qpr3-s1-release pie-qpr3-release pie-qpr3-b-release pie-qpr2-release pie-qpr1-s3-release pie-qpr1-s2-release pie-qpr1-s1-release pie-qpr1-release pie-dr1-release pie-dr1-dev pie-dev pie-b4s4-release pie-b4s4-dev oreo-mr1-1.2-iot-release nougat-iot-release master-cuttlefish-testing-release
am: 1f4ec3258f
Change-Id: Icb9df1558e7d87c03080597ffbb5a6212817cba6
Diffstat (limited to 'internal/output_sse.h')
-rw-r--r-- | internal/output_sse.h | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/internal/output_sse.h b/internal/output_sse.h
index 5c06253..75aebfd 100644
--- a/internal/output_sse.h
+++ b/internal/output_sse.h
@@ -103,6 +103,82 @@ struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
   }
 };
 
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<4>> {
+  typedef RegBufferInt32<4> InputType;
+  typedef RegBufferInt16<4> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    __m128i res_16 = _mm_packs_epi32(input.reg[0], input.reg[0]);
+    output.reg[0] = _mm_extract_epi16(res_16, 0);
+    output.reg[1] = _mm_extract_epi16(res_16, 1);
+    output.reg[2] = _mm_extract_epi16(res_16, 2);
+    output.reg[3] = _mm_extract_epi16(res_16, 3);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<8>> {
+  typedef RegBufferInt32<8> InputType;
+  typedef RegBufferInt16<8> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<16>> {
+  typedef RegBufferInt32<16> InputType;
+  typedef RegBufferInt16<16> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+    return output;
+  }
+};
+
+template <>
+struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
+                                 RegBufferInt32<32>> {
+  typedef RegBufferInt32<32> InputType;
+  typedef RegBufferInt16<32> OutputType;
+
+  typedef OutputStageSaturatingCastToInt16 OutputStage;
+
+  OutputStageEvalBufferImpl(const OutputStage&) {}
+
+  OutputType Eval(InputType input) const {
+    OutputType output;
+    output.reg[0] = _mm_packs_epi32(input.reg[0], input.reg[1]);
+    output.reg[1] = _mm_packs_epi32(input.reg[2], input.reg[3]);
+    output.reg[2] = _mm_packs_epi32(input.reg[4], input.reg[5]);
+    output.reg[3] = _mm_packs_epi32(input.reg[6], input.reg[7]);
+    return output;
+  }
+};
+
 template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<4, 1>, DstType> {
   static void Run(const RegBlockInt32<4, 1>& src, DstType* dst, int row,
@@ -138,6 +214,36 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 1>, DstType> {
   }
 };
 
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 1>, DstType> {
+  static void Run(const RegBlockInt16<4, 1>& src, DstType* dst, int row,
+                  int col) {
+    *dst->data(row + 0, col) = src.buf.reg[0];
+    *dst->data(row + 1, col) = src.buf.reg[1];
+    *dst->data(row + 2, col) = src.buf.reg[2];
+    *dst->data(row + 3, col) = src.buf.reg[3];
+  }
+};
+
+template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 1>, DstType> {
+  static void Run(const RegBlockInt16<8, 1>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      StoreInt16x8(dst->data(row, col), src.buf.reg[0]);
+    } else {
+      *dst->data(row + 0, col) = _mm_extract_epi16(src.buf.reg[0], 0);
+      *dst->data(row + 1, col) = _mm_extract_epi16(src.buf.reg[0], 1);
+      *dst->data(row + 2, col) = _mm_extract_epi16(src.buf.reg[0], 2);
+      *dst->data(row + 3, col) = _mm_extract_epi16(src.buf.reg[0], 3);
+      *dst->data(row + 4, col) = _mm_extract_epi16(src.buf.reg[0], 4);
+      *dst->data(row + 5, col) = _mm_extract_epi16(src.buf.reg[0], 5);
+      *dst->data(row + 6, col) = _mm_extract_epi16(src.buf.reg[0], 6);
+      *dst->data(row + 7, col) = _mm_extract_epi16(src.buf.reg[0], 7);
+    }
+  }
+};
+
 inline RegBlockInt32<4, 4> Transpose(const RegBlockInt32<4, 4>& src) {
   __m128i t0 = _mm_unpacklo_epi32(src.buf.reg[0], src.buf.reg[1]);
   __m128i t1 = _mm_unpacklo_epi32(src.buf.reg[2], src.buf.reg[3]);
@@ -170,6 +276,21 @@ struct StoreFinalOutputImpl<RegBlockInt32<4, 4>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<4, 4>, DstType> {
+  static void Run(const RegBlockInt16<4, 4>& src, DstType* dst, int row,
+                  int col) {
+    std::int16_t buf[16];
+    StoreInt16x8(buf + 0, src.buf.reg[0]);
+    StoreInt16x8(buf + 8, src.buf.reg[1]);
+    for (int i = 0; i < 4; i++) {
+      for (int j = 0; j < 4; j++) {
+        *dst->data(row + i, col + j) = buf[i + 4 * j];
+      }
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
   static void Run(const RegBlockInt32<8, 4>& src, DstType* dst, int row,
                   int col) {
@@ -202,6 +323,29 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 4>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 4>, DstType> {
+  static void Run(const RegBlockInt16<8, 4>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int i = 0; i < 4; i++) {
+        StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+      }
+    } else {
+      std::int16_t buf[32];
+      StoreInt16x8(buf + 0, src.buf.reg[0]);
+      StoreInt16x8(buf + 8, src.buf.reg[1]);
+      StoreInt16x8(buf + 16, src.buf.reg[2]);
+      StoreInt16x8(buf + 24, src.buf.reg[3]);
+      for (int i = 0; i < 8; i++) {
+        for (int j = 0; j < 4; j++) {
+          *dst->data(row + i, col + j) = buf[i + 8 * j];
+        }
+      }
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
   static void Run(const RegBlockInt32<8, 8>& src, DstType* dst, int row,
                   int col) {
@@ -255,6 +399,48 @@ struct StoreFinalOutputImpl<RegBlockInt32<8, 8>, DstType> {
 };
 
 template <typename DstType>
+struct StoreFinalOutputImpl<RegBlockInt16<8, 8>, DstType> {
+  static void Run(const RegBlockInt16<8, 8>& src, DstType* dst, int row,
+                  int col) {
+    if (DstType::kOrder == MapOrder::ColMajor) {
+      for (int i = 0; i < 8; i++) {
+        StoreInt16x8(dst->data(row, col + i), src.buf.reg[i]);
+      }
+    } else {
+      // top-left 4x4
+      __m128i t0 = _mm_unpacklo_epi16(src.buf.reg[0], src.buf.reg[1]);
+      __m128i t1 = _mm_unpacklo_epi16(src.buf.reg[2], src.buf.reg[3]);
+      __m128i u0 = _mm_unpacklo_epi32(t0, t1);
+      __m128i u1 = _mm_unpackhi_epi32(t0, t1);
+      // top-right 4x4
+      __m128i t2 = _mm_unpacklo_epi16(src.buf.reg[4], src.buf.reg[5]);
+      __m128i t3 = _mm_unpacklo_epi16(src.buf.reg[6], src.buf.reg[7]);
+      __m128i u2 = _mm_unpacklo_epi32(t2, t3);
+      __m128i u3 = _mm_unpackhi_epi32(t2, t3);
+      // bottom-left 4x4
+      __m128i t4 = _mm_unpackhi_epi16(src.buf.reg[0], src.buf.reg[1]);
+      __m128i t5 = _mm_unpackhi_epi16(src.buf.reg[2], src.buf.reg[3]);
+      __m128i u4 = _mm_unpacklo_epi32(t4, t5);
+      __m128i u5 = _mm_unpackhi_epi32(t4, t5);
+      // bottom-right 4x4
+      __m128i t6 = _mm_unpackhi_epi16(src.buf.reg[4], src.buf.reg[5]);
+      __m128i t7 = _mm_unpackhi_epi16(src.buf.reg[6], src.buf.reg[7]);
+      __m128i u6 = _mm_unpacklo_epi32(t6, t7);
+      __m128i u7 = _mm_unpackhi_epi32(t6, t7);
+
+      StoreInt16x8(dst->data(row + 0, col), _mm_unpacklo_epi64(u0, u2));
+      StoreInt16x8(dst->data(row + 1, col), _mm_unpackhi_epi64(u0, u2));
+      StoreInt16x8(dst->data(row + 2, col), _mm_unpacklo_epi64(u1, u3));
+      StoreInt16x8(dst->data(row + 3, col), _mm_unpackhi_epi64(u1, u3));
+      StoreInt16x8(dst->data(row + 4, col), _mm_unpacklo_epi64(u4, u6));
+      StoreInt16x8(dst->data(row + 5, col), _mm_unpackhi_epi64(u4, u6));
+      StoreInt16x8(dst->data(row + 6, col), _mm_unpacklo_epi64(u5, u7));
+      StoreInt16x8(dst->data(row + 7, col), _mm_unpackhi_epi64(u5, u7));
+    }
+  }
+};
+
+template <typename DstType>
 struct StoreFinalOutputImpl<RegBlockInt32<1, 4>, DstType> {
   static void Run(const RegBlockInt32<1, 4>& src, DstType* dst, int row,
                   int col) {