// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_

#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <string>
#include <unordered_map>
#endif

namespace gemmlowp {

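// Context for a single-threaded GEMM: owns the Allocator used for all
// temporary packed blocks, and exposes the cache-size tuning knobs that
// drive the choice of block sizes below.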
class SingleThreadGemmContext {
 public:
  Allocator* allocator() { return &allocator_; }

  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }

  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }

 protected:
  Allocator allocator_;

  // The cache configuration to use.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};

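// SingleThreadGemm is the core single-threaded GEMM routine. It packs
// blocks of lhs and rhs into kernel-friendly layouts, runs the compute
// kernel on each pair of packed blocks, and unpacks the int32 accumulators
// through the output pipeline into the result matrix.
//
// A rough invocation sketch. This is illustrative only: SomeKernel is a
// hypothetical placeholder for one of the kernels defined in the kernel
// headers, and the public entry points in public/gemmlowp.h normally set
// all of this up for you:
//
//   SingleThreadGemmContext context;
//   SomeKernel kernel;  // hypothetical kernel type
//   SingleThreadGemm<SomeKernel::Format, std::uint8_t, std::uint8_t,
//                    DefaultL8R8BitDepthParams>(
//       &context, kernel, lhs, rhs, &result, lhs_offset, rhs_offset,
//       output_pipeline);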
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // Zero sizes should have been caught earlier and handled by an early return.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case rows < cols should have been caught earlier and handled by
  // transposing the problem.
  assert(rows >= cols);

  Allocator* allocator = context->allocator();

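  // Choose the two-level (L1/L2) blocking of the GEMM from the matrix
  // dimensions and the cache sizes configured on the context. The 1 is the
  // thread count: this is the single-threaded path.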
  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, 1,
                                  context->l1_bytes_to_use(),
                                  context->l2_bytes_to_use(),
                                  context->l2_rhs_factor());

#ifdef GEMMLOWP_PROFILING_SIZES
  // Using a static map of label strings. Not reentrant at all!
  static std::unordered_map<std::uint64_t, std::string> labels_map;
  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                             (static_cast<std::uint64_t>(depth) << 16) ^
                             (static_cast<std::uint64_t>(cols) << 32);
  if (!labels_map.count(sizes_hash)) {
    char label[256];
    snprintf(label, sizeof(label),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    labels_map[sizes_hash] = label;
  }
  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
#endif

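  // Reserve storage for one packed block of each side. Construction only
  // reserves space in the Allocator; no memory is allocated yet.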
  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);

  PackedResult packed_result(allocator, block_params);

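  // Perform the actual allocation of everything reserved above, in one shot.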
  allocator->Commit();

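  // If a single L2 column block spans all of rhs, pack rhs just once,
  // outside the loops, instead of once per column block.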
  const bool pack_rhs_once = block_params.l2_cols >= cols;

  if (pack_rhs_once) {
    PackRhs(&packed_rhs, rhs);
  }

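  // Outer loop over row blocks of lhs/result; lhs is packed once per row
  // block and reused across all column blocks in the inner loop.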
  for (int r = 0; r < rows; r += block_params.l2_rows) {
    int rs = std::min(block_params.l2_rows, rows - r);

    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

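      // Pack the current depth x cs column block of rhs (unless it was
      // already packed once above).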
      if (!pack_rhs_once) {
        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
      }

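      // Run the kernel on the packed blocks, accumulating int32 results
      // into packed_result.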
      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

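      // Add the lhs_offset/rhs_offset contributions (via the per-slice sums
      // computed while packing), run the output pipeline, and write the
      // (rs x cs) block at (r, c) of the result matrix.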
      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
    }
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_