internal/common.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../internal/detect_platform.h"
#include "../profiling/instrumentation.h"

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
// for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.

#ifdef GEMMLOWP_AVX2
const int kRegisterSize = 32;
#else
const int kRegisterSize = 16;
#endif

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
  // Aarch64 has very detailed prefetch instructions, that compilers
  // can't know how to map __builtin_prefetch to, and as a result, don't,
  // leaving __builtin_prefetch a no-op on this architecture.
  // For our purposes, "pldl1keep" is usually what we want, meaning:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined \
    __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}

// Returns the argument rounded up to the nearest power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  i |= i >> 1;
  i |= i >> 2;
  i |= i >> 4;
  i |= i >> 8;
  i |= i >> 16;
  return i + 1;
}

template <int N>
struct IsPowerOfTwo {
  static constexpr bool value = !(N & (N - 1));
};

template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  (void)ptr;
  (void)size;
#endif
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_