summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andy Hung <hunga@google.com> 2024-03-12 19:14:37 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2024-03-12 19:14:37 +0000
commit 9c8832d72d384474061d39b604b91cd8e6135444 (patch)
tree 31a7d114fc56bdd024fab2b85dbcd79662e606b9
parent 8f6208583277d35a6870db3266f70cbae9815010 (diff)
parent 21d7c296eb7381bbbd65489d96844486341f07b6 (diff)
download media-9c8832d72d384474061d39b604b91cd8e6135444.tar.gz
Merge changes Iebad14ab,I662e8789 into main
* changes:
  audio mutex: Refine metadata visibility
  audio mutex: Optimize data access
-rw-r--r-- audio_utils/benchmarks/audio_mutex_benchmark.cpp 171
-rw-r--r-- audio_utils/include/audio_utils/mutex.h 177
2 files changed, 266 insertions, 82 deletions
diff --git a/audio_utils/benchmarks/audio_mutex_benchmark.cpp b/audio_utils/benchmarks/audio_mutex_benchmark.cpp
index a58aec31..216e8a99 100644
--- a/audio_utils/benchmarks/audio_mutex_benchmark.cpp
+++ b/audio_utils/benchmarks/audio_mutex_benchmark.cpp
@@ -34,79 +34,84 @@ $ atest audio_mutex_benchmark
Benchmark Time CPU Iteration
audio_mutex_benchmark:
- #BM_atomic_add_equals<int32_t> 6.508700118072382 ns 6.471633177192451 ns 108110486
- #BM_atomic_add_to_seq_cst<int16_t> 6.557658152513349 ns 6.526665108542128 ns 107252873
- #BM_atomic_add_to_seq_cst<int32_t> 6.61304199453549 ns 6.58175539524565 ns 106351923
- #BM_atomic_add_to_seq_cst<int64_t> 6.557521711571485 ns 6.5265363568644625 ns 107250668
- #BM_atomic_add_to_seq_cst<float> 7.895243222524512 ns 7.858297243207844 ns 89394951
- #BM_atomic_add_to_seq_cst<double> 7.931688495474578 ns 7.893971885098797 ns 88653486
- #BM_atomic_add_to_relaxed<int16_t> 5.140386288993005 ns 5.116383769230237 ns 135131188
- #BM_atomic_add_to_relaxed<int32_t> 5.181670175781189 ns 5.157418005923224 ns 135724804
- #BM_atomic_add_to_relaxed<int64_t> 5.161260548149761 ns 5.136776648952849 ns 135135216
- #BM_atomic_add_to_relaxed<float> 7.786417198158838 ns 7.749791796134465 ns 90646732
- #BM_atomic_add_to_relaxed<double> 7.760358404716961 ns 7.723992286938152 ns 90644677
- #BM_gettid 2.116039491081284 ns 2.106033253650779 ns 332358395
- #BM_systemTime 43.074033150581585 ns 42.8699911242381 ns 16328739
- #BM_thread_8_variables 2.8214796173366734 ns 2.8081271094521703 ns 249273547
- #BM_thread_local_8_variables 2.819987500327649 ns 2.808149311074747 ns 249278495
- #BM_StdMutexLockUnlock 18.155770972784783 ns 18.070903999828232 ns 38747264
- #BM_RWMutexReadLockUnlock 16.12456214871892 ns 16.04901684644192 ns 43612414
- #BM_RWMutexWriteLockUnlock 19.14824893658628 ns 19.05893391346091 ns 36725255
- #BM_SharedMutexReadLockUnlock 39.54155074347332 ns 39.35497456828369 ns 17788418
- #BM_SharedMutexWriteLockUnlock 41.58785205766037 ns 41.39323040198865 ns 16911078
- #BM_AudioUtilsMutexLockUnlock 66.56918230215399 ns 66.25544975244046 ns 10562911
- #BM_AudioUtilsPIMutexLockUnlock 67.02589961630612 ns 66.70819768056897 ns 10493090
- #BM_StdMutexInitializationLockUnlock 29.544903877103074 ns 29.406544528057406 ns 23801319
- #BM_RWMutexInitializationReadLockUnlock 26.91749522594829 ns 26.802654591541785 ns 26123567
- #BM_RWMutexInitializationWriteLockUnlock 30.20599678894913 ns 30.06422812747118 ns 23284596
- #BM_SharedMutexInitializationReadLockUnlock 58.070478136125395 ns 57.79511704041489 ns 12111671
- #BM_SharedMutexInitializationWriteLockUnlock 59.36722820827075 ns 59.08875400469678 ns 11843905
- #BM_AudioUtilsMutexInitializationLockUnlock 85.04952357479699 ns 84.65093492146583 ns 8269839
- #BM_AudioUtilsPIMutexInitializationLockUnlock 83.32953114993384 ns 82.9411400506946 ns 8440765
- #BM_StdMutexBlockingConditionVariable/threads:2 20067.186478012434 ns 25402.779402102544 ns 54792
- #BM_AudioUtilsMutexBlockingConditionVariable/threads:2 48417.40553370931 ns 58220.13591731267 ns 23220
- #BM_AudioUtilsPIMutexBlockingConditionVariable/threads:2 48724.90563264992 ns 59858.82489342454 ns 15482
- #BM_StdMutexScopedLockUnlock/threads:1 33.58821991644139 ns 33.41913176098606 ns 16058919
- #BM_StdMutexScopedLockUnlock/threads:2 356.67886764843007 ns 707.8318856903202 ns 4625680
- #BM_StdMutexScopedLockUnlock/threads:4 130.45108549886208 ns 447.1268742499998 ns 4000000
- #BM_StdMutexScopedLockUnlock/threads:8 139.0823761208755 ns 541.9088026721488 ns 1362200
- #BM_RWMutexScopedReadLockUnlock/threads:1 32.33613871803748 ns 32.194204614295046 ns 21710272
- #BM_RWMutexScopedReadLockUnlock/threads:2 160.47792160732033 ns 319.3012639397403 ns 2095986
- #BM_RWMutexScopedReadLockUnlock/threads:4 217.21087383931467 ns 861.2673855686197 ns 839892
- #BM_RWMutexScopedReadLockUnlock/threads:8 232.19586516883186 ns 1831.4409709220026 ns 491368
- #BM_RWMutexScopedWriteLockUnlock/threads:1 33.49908180449042 ns 33.34195684310611 ns 21010780
- #BM_RWMutexScopedWriteLockUnlock/threads:2 286.096410842338 ns 564.599202114389 ns 2485068
- #BM_RWMutexScopedWriteLockUnlock/threads:4 451.7913123512162 ns 1601.6332793492106 ns 1931432
- #BM_RWMutexScopedWriteLockUnlock/threads:8 417.50240217790537 ns 1678.8585405353656 ns 794072
- #BM_SharedMutexScopedReadLockUnlock/threads:1 67.65354544884363 ns 67.37498338520537 ns 9133426
- #BM_SharedMutexScopedReadLockUnlock/threads:2 370.22816132765433 ns 735.4710534035784 ns 1322608
- #BM_SharedMutexScopedReadLockUnlock/threads:4 298.7991937078523 ns 1015.8674764877635 ns 991824
- #BM_SharedMutexScopedReadLockUnlock/threads:8 359.17200914091643 ns 1500.1318202480697 ns 615960
- #BM_SharedMutexScopedWriteLockUnlock/threads:1 73.40224842642553 ns 73.06218848168656 ns 8616869
- #BM_SharedMutexScopedWriteLockUnlock/threads:2 502.8427941278981 ns 909.1756670594543 ns 599122
- #BM_SharedMutexScopedWriteLockUnlock/threads:4 2322.7325028106275 ns 6083.585590040707 ns 313436
- #BM_SharedMutexScopedWriteLockUnlock/threads:8 4948.555700826256 ns 15412.772486815033 ns 373152
- #BM_AudioUtilsMutexScopedLockUnlock/threads:1 147.60580533538862 ns 146.97151308638587 ns 4062848
- #BM_AudioUtilsMutexScopedLockUnlock/threads:2 5409.319112352385 ns 10729.084861761592 ns 728090
- #BM_AudioUtilsMutexScopedLockUnlock/threads:4 630.9403610213494 ns 1866.9171243841429 ns 579688
- #BM_AudioUtilsMutexScopedLockUnlock/threads:8 612.9153996947896 ns 2167.0654441098654 ns 417104
- #BM_AudioUtilsPIMutexScopedLockUnlock/threads:1 148.94249680999073 ns 148.3061023465011 ns 4387722
- #BM_AudioUtilsPIMutexScopedLockUnlock/threads:2 3537.898640072271 ns 4287.604650248743 ns 356196
- #BM_AudioUtilsPIMutexScopedLockUnlock/threads:4 13969.834843789307 ns 19572.29615170118 ns 28688
- #BM_AudioUtilsPIMutexScopedLockUnlock/threads:8 30652.264078729862 ns 40000.50360617244 ns 23848
- #BM_StdMutexReverseScopedLockUnlock/threads:1 31.34740304135938 ns 31.200396418488175 ns 21854682
- #BM_StdMutexReverseScopedLockUnlock/threads:2 54.06016658620641 ns 103.2554157873692 ns 5317694
- #BM_StdMutexReverseScopedLockUnlock/threads:4 169.8661622311813 ns 592.4042833246494 ns 3209096
- #BM_StdMutexReverseScopedLockUnlock/threads:8 156.65913206788008 ns 604.623918327717 ns 1742672
- #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:1 147.51456839840807 ns 146.73295356311675 ns 4395816
- #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:2 2425.8992549948744 ns 4812.346055000001 ns 200000
- #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:4 453.8639331349259 ns 1256.0567649999934 ns 400000
- #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:8 635.5625220561735 ns 2294.725433768965 ns 356872
- #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:1 148.7079480412097 ns 148.0359150267745 ns 4188943
- #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:2 14037.435207752424 ns 17829.977469499998 ns 2000000
- #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:4 20098.127750043204 ns 26126.68207500001 ns 40000
- #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:8 28805.264783022852 ns 38780.66452074406 ns 16776
- #BM_empty_while 0.352701456999057 ns 0.35104016500000057 ns 1000000000
+ #BM_atomic_add_equals<int32_t> 6.490418995069907 ns 6.4717376481357896 ns 108158957
+ #BM_atomic_add_to_seq_cst<int16_t> 6.5491215252883315 ns 6.528080020397028 ns 107223462
+ #BM_atomic_add_to_seq_cst<int32_t> 6.6043085052910895 ns 6.58277029519354 ns 106339895
+ #BM_atomic_add_to_seq_cst<int64_t> 6.547130657683545 ns 6.527710835284538 ns 107241300
+ #BM_atomic_add_to_seq_cst<float> 7.886748664549371 ns 7.8608841749954355 ns 89038641
+ #BM_atomic_add_to_seq_cst<double> 7.935562180977917 ns 7.910317678915859 ns 88467659
+ #BM_atomic_add_to_relaxed<int16_t> 5.144169800499881 ns 5.127361036326905 ns 137048029
+ #BM_atomic_add_to_relaxed<int32_t> 5.168846899607784 ns 5.15352435703851 ns 135676634
+ #BM_atomic_add_to_relaxed<int64_t> 5.156732436798179 ns 5.141098657660172 ns 136413594
+ #BM_atomic_add_to_relaxed<float> 7.763552575883229 ns 7.740082913372091 ns 90622053
+ #BM_atomic_add_to_relaxed<double> 7.760723400931919 ns 7.734912038002161 ns 90618849
+ #BM_atomic_add_to_unordered<int16_t> 0.3533747960000255 ns 0.3520660080000013 ns 1000000000
+ #BM_atomic_add_to_unordered<int32_t> 0.3534268799999154 ns 0.352105915000001 ns 1000000000
+ #BM_atomic_add_to_unordered<int64_t> 0.35323697900003026 ns 0.35204362999999894 ns 1000000000
+ #BM_atomic_add_to_unordered<float> 0.7043684281594664 ns 0.7021647364411598 ns 988700453
+ #BM_atomic_add_to_unordered<double> 0.704450251294842 ns 0.7021509129361374 ns 996984194
+ #BM_gettid 2.1128518266669247 ns 2.106438363375118 ns 332324191
+ #BM_systemTime 43.50481860259643 ns 43.373881730104316 ns 16213103
+ #BM_thread_8_variables 2.8174664322723015 ns 2.8088585587229447 ns 249169901
+ #BM_thread_local_8_variables 2.8176008559183345 ns 2.8088503601840658 ns 249223823
+ #BM_StdMutexLockUnlock 20.51083054083145 ns 20.372680732323307 ns 34365643
+ #BM_RWMutexReadLockUnlock 17.182708241218037 ns 17.10085471231418 ns 40872349
+ #BM_RWMutexWriteLockUnlock 20.01395996116509 ns 19.912307155808747 ns 35403322
+ #BM_SharedMutexReadLockUnlock 39.34289759177089 ns 39.214633183208534 ns 17848256
+ #BM_SharedMutexWriteLockUnlock 42.42260135644499 ns 42.25185684039555 ns 16568333
+ #BM_AudioUtilsMutexLockUnlock 32.59607274485956 ns 32.48981289778165 ns 21549504
+ #BM_AudioUtilsPIMutexLockUnlock 33.79847568429067 ns 33.6964229192697 ns 20775880
+ #BM_StdMutexInitializationLockUnlock 30.36758342133683 ns 30.254101880283233 ns 23141887
+ #BM_RWMutexInitializationReadLockUnlock 27.28375660870322 ns 27.196487591970985 ns 25738012
+ #BM_RWMutexInitializationWriteLockUnlock 30.20724264266599 ns 30.096879362472333 ns 23256594
+ #BM_SharedMutexInitializationReadLockUnlock 57.43815201585343 ns 57.27383821891096 ns 12222096
+ #BM_SharedMutexInitializationWriteLockUnlock 59.42673061824289 ns 59.25235124842115 ns 11814362
+ #BM_AudioUtilsMutexInitializationLockUnlock 46.10038716918369 ns 45.953988368973455 ns 15233909
+ #BM_AudioUtilsPIMutexInitializationLockUnlock 50.73553222492994 ns 50.574418947890834 ns 13835334
+ #BM_StdMutexBlockingConditionVariable/threads:2 11523.72384534072 ns 12714.605659025783 ns 58632
+ #BM_AudioUtilsMutexBlockingConditionVariable/threads:2 9338.361496790618 ns 11206.032771535578 ns 74760
+ #BM_AudioUtilsPIMutexBlockingConditionVariable/threads:2 12430.610334229705 ns 13459.017326162135 ns 52060
+ #BM_StdMutexScopedLockUnlock/threads:1 33.534067204276546 ns 33.40309483152711 ns 20796027
+ #BM_StdMutexScopedLockUnlock/threads:2 269.1759952499524 ns 533.950398499998 ns 2000000
+ #BM_StdMutexScopedLockUnlock/threads:4 90.18870335515196 ns 271.3231852294451 ns 2269488
+ #BM_StdMutexScopedLockUnlock/threads:8 121.03213508602038 ns 451.7371193384729 ns 2448632
+ #BM_RWMutexScopedReadLockUnlock/threads:1 32.11047130691962 ns 31.96092065619549 ns 21757351
+ #BM_RWMutexScopedReadLockUnlock/threads:2 117.73928731993787 ns 230.31362984633367 ns 2348992
+ #BM_RWMutexScopedReadLockUnlock/threads:4 220.8538545474783 ns 858.6430804361402 ns 949424
+ #BM_RWMutexScopedReadLockUnlock/threads:8 217.2344705376624 ns 1528.7949547499559 ns 460552
+ #BM_RWMutexScopedWriteLockUnlock/threads:1 34.76444514474894 ns 34.665961723712094 ns 20194069
+ #BM_RWMutexScopedWriteLockUnlock/threads:2 303.41208949994325 ns 603.2115715000019 ns 2000000
+ #BM_RWMutexScopedWriteLockUnlock/threads:4 298.4931931843524 ns 916.926215593706 ns 1571660
+ #BM_RWMutexScopedWriteLockUnlock/threads:8 432.74492906249407 ns 1240.5567937500045 ns 800000
+ #BM_SharedMutexScopedReadLockUnlock/threads:1 70.04048550107358 ns 69.8046640694218 ns 9059342
+ #BM_SharedMutexScopedReadLockUnlock/threads:2 357.07506909046754 ns 709.4210754541601 ns 1482834
+ #BM_SharedMutexScopedReadLockUnlock/threads:4 336.03568074383156 ns 1087.821794679974 ns 989168
+ #BM_SharedMutexScopedReadLockUnlock/threads:8 343.4415500594684 ns 1423.0045686060148 ns 870944
+ #BM_SharedMutexScopedWriteLockUnlock/threads:1 77.31578352815413 ns 77.00259046212362 ns 8135228
+ #BM_SharedMutexScopedWriteLockUnlock/threads:2 356.1377498778198 ns 627.7192368534169 ns 1218796
+ #BM_SharedMutexScopedWriteLockUnlock/threads:4 2206.5972784481546 ns 5390.78073569482 ns 770700
+ #BM_SharedMutexScopedWriteLockUnlock/threads:8 2643.145098618517 ns 7265.627503497389 ns 1012184
+ #BM_AudioUtilsMutexScopedLockUnlock/threads:1 68.37942831761342 ns 68.16332511845363 ns 8684647
+ #BM_AudioUtilsMutexScopedLockUnlock/threads:2 439.5642884199026 ns 868.7699421475584 ns 1605118
+ #BM_AudioUtilsMutexScopedLockUnlock/threads:4 321.1245397453114 ns 1025.1737506853917 ns 2203128
+ #BM_AudioUtilsMutexScopedLockUnlock/threads:8 302.42947515758783 ns 1176.8521985370544 ns 1262112
+ #BM_AudioUtilsPIMutexScopedLockUnlock/threads:1 69.87225800700081 ns 69.64552224576019 ns 8994051
+ #BM_AudioUtilsPIMutexScopedLockUnlock/threads:2 4420.777346513025 ns 5456.967229338184 ns 265756
+ #BM_AudioUtilsPIMutexScopedLockUnlock/threads:4 1506.8638396645179 ns 1927.406805542472 ns 424360
+ #BM_AudioUtilsPIMutexScopedLockUnlock/threads:8 25030.96209476646 ns 27871.63623561846 ns 33376
+ #BM_StdMutexReverseScopedLockUnlock/threads:1 33.47593087477488 ns 33.37508010876382 ns 20550186
+ #BM_StdMutexReverseScopedLockUnlock/threads:2 198.84388250011398 ns 385.92393400000117 ns 2000000
+ #BM_StdMutexReverseScopedLockUnlock/threads:4 93.50488264641875 ns 276.2069913615782 ns 3951648
+ #BM_StdMutexReverseScopedLockUnlock/threads:8 110.50842131360572 ns 378.4212902611287 ns 2141768
+ #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:1 68.2132503060489 ns 68.01976601705918 ns 9013905
+ #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:2 223.03285165273516 ns 424.2072166440236 ns 1879738
+ #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:4 264.11614886066064 ns 743.2390829429721 ns 1815416
+ #BM_AudioUtilsMutexReverseScopedLockUnlock/threads:8 274.5291393750193 ns 1015.9050412499973 ns 800000
+ #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:1 69.50784383771779 ns 69.31317329009033 ns 8408894
+ #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:2 631.9418303245776 ns 790.5849174679049 ns 1417388
+ #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:4 12829.762531245593 ns 15402.261100000063 ns 40000
+ #BM_AudioUtilsPIMutexReverseScopedLockUnlock/threads:8 24954.85928430851 ns 26994.147940851126 ns 28944
+ #BM_empty_while 0.3522347409998474 ns 0.35108219199999985 ns 1000000000
*/
@@ -165,6 +170,26 @@ BENCHMARK(BM_atomic_add_to_relaxed<float>);
BENCHMARK(BM_atomic_add_to_relaxed<double>);
+template <typename T>  // Benchmarks atomic_add_to() on an unordered_atomic<T> accumulator.
+static void BM_atomic_add_to_unordered(benchmark::State &state) {
+ int64_t i64 = 10;  // addend; its type differs from T for most instantiations below
+ android::audio_utils::unordered_atomic<T> dst{};  // zeroed; a plain `dst;` is indeterminate
+ while (state.KeepRunning()) {
+ android::audio_utils::atomic_add_to(dst, i64, std::memory_order_relaxed);
+ }
+ LOG(DEBUG) << __func__ << " " << dst.load();  // log the accumulated value
+}
+
+BENCHMARK(BM_atomic_add_to_unordered<int16_t>);
+
+BENCHMARK(BM_atomic_add_to_unordered<int32_t>);
+
+BENCHMARK(BM_atomic_add_to_unordered<int64_t>);
+
+BENCHMARK(BM_atomic_add_to_unordered<float>);
+
+BENCHMARK(BM_atomic_add_to_unordered<double>);
+
// Benchmark gettid(). The mutex class uses this to get the linux thread id.
static void BM_gettid(benchmark::State &state) {
int32_t value = 0;
diff --git a/audio_utils/include/audio_utils/mutex.h b/audio_utils/include/audio_utils/mutex.h
index a473b611..aafdab45 100644
--- a/audio_utils/include/audio_utils/mutex.h
+++ b/audio_utils/include/audio_utils/mutex.h
@@ -389,6 +389,132 @@ public:
static constexpr bool abort_on_invalid_unlock_ = true;
};
+// relaxed_atomic wraps std::atomic<T>, exposing a subset of its interface with
+// std::memory_order_relaxed as the default memory order.
+//
+// This is the minimum consistency for the multiple writer multiple reader case.
+
+template <typename T>
+class relaxed_atomic : private std::atomic<T> {  // private base: only members below are exposed
+public:
+ constexpr relaxed_atomic(T desired) : std::atomic<T>(desired) {}  // implicit conversion from T
+ operator T() const { return std::atomic<T>::load(std::memory_order_relaxed); }
+ T operator=(T desired) {  // relaxed store; returns the stored value, not a reference
+ std::atomic<T>::store(desired, std::memory_order_relaxed); return desired;
+ }
+ // Pre-decrement, pre-increment and += are relaxed read-modify-writes returning the new value.
+ T operator--() { return std::atomic<T>::fetch_sub(1, std::memory_order_relaxed) - 1; }
+ T operator++() { return std::atomic<T>::fetch_add(1, std::memory_order_relaxed) + 1; }
+ T operator+=(const T value) {
+ return std::atomic<T>::fetch_add(value, std::memory_order_relaxed) + value;
+ }
+ // The members below forward to std::atomic<T>; the order argument defaults to relaxed.
+ T load(std::memory_order order = std::memory_order_relaxed) const {
+ return std::atomic<T>::load(order);
+ }
+ T fetch_add(T arg, std::memory_order order =std::memory_order_relaxed) {
+ return std::atomic<T>::fetch_add(arg, order);
+ }
+ bool compare_exchange_weak(  // forwards the single order argument to std::atomic<T>
+ T& expected, T desired, std::memory_order order = std::memory_order_relaxed) {
+ return std::atomic<T>::compare_exchange_weak(expected, desired, order);
+ }
+};
+
+// unordered_atomic implements data storage such that memory reads have a value
+// consistent with a memory write in some order, i.e. not having values
+// "out of thin air".
+//
+// Unordered memory reads and writes may not actually take place but be implicitly cached.
+// Nevertheless, a memory read should return at least as contemporaneous a value
+// as the last memory write before the write thread memory barrier that
+// preceded the most recent read thread memory barrier.
+//
+// This is weaker than relaxed_atomic and has no equivalent C++ terminology.
+// unordered_atomic would be used for a single writer, multiple reader case,
+// where data access of type T would be implemented by the compiler and
+// hw architecture with a single "uninterruptible" memory operation.
+// (The current implementation holds true for general realized CPU architectures).
+// Note that multiple writers would cause read-modify-write unordered_atomic
+// operations to have inconsistent results.
+//
+// unordered_atomic is implemented with normal operations such that compiler
+// optimizations can take place which would otherwise be discouraged for atomics.
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0062r1.html
+
+template <typename T>
+class unordered_atomic {
+ static_assert(std::atomic<T>::is_always_lock_free);  // T must be lock-free as a std::atomic
+public:
+ unordered_atomic() = default;  // NOTE: default-initialization leaves t_ indeterminate
+ constexpr unordered_atomic(T desired) : t_(desired) {}  // implicit conversion from T
+ operator T() const { return t_; }  // plain read of the stored value
+ T& operator=(T desired) { return t_ = desired; }  // returns a reference to the stored value
+ // The operators below are plain (non-atomic) read-modify-writes returning a reference to t_.
+ T& operator--() { const T temp = t_ - 1; return t_ = temp; }
+ T& operator++() { const T temp = t_ + 1; return t_ = temp; }
+ T& operator+=(const T value) { const T temp = t_ + value; return t_ = temp; }
+ // load() accepts a memory order argument but ignores it.
+ T load(std::memory_order order = std::memory_order_relaxed) const { (void)order; return t_; }
+
+private:
+ T t_;  // plain storage; no atomic or volatile qualification
+};
+
+// While std::atomic with the default std::memory_order_seq_cst
+// access could be used, it results in performance loss over less
+// restrictive memory access.
+
+// stats_atomic is a multiple writer multiple reader object.
+//
+// This is normally used to increment statistics counters on
+// mutex priority categories.
+//
+// We use relaxed_atomic instead of std::atomic/memory_order_seq_cst here.
+template <typename T>
+using stats_atomic = relaxed_atomic<T>;
+
+// thread_atomic is a single writer multiple reader object.
+//
+// This is normally accessed as a thread local (hence single writer)
+// but may be accessed (rarely) by multiple readers during deadlock
+// detection, which does not modify the data.
+//
+// We use unordered_atomic instead of std::atomic/memory_order_seq_cst here.
+template <typename T>
+using thread_atomic = unordered_atomic<T>;
+
+inline void compiler_memory_barrier() {
+ // Reads or writes are not migrated or cached by the compiler across this barrier.
+ asm volatile("" ::: "memory");  // empty asm with a "memory" clobber: no instruction is emitted
+
+ // If not using GNU / Clang, compare with the compiler-only barrier generated by
+ // std::atomic_signal_fence(std::memory_order_seq_cst);
+ // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0124r7.html
+}
+
+// The mutex locking is thread-safe.
+//
+// However, the mutex metadata (statistics and thread info) updates are not locked
+// by an internal mutex for efficiency reasons. Instead, they use atomics, with
+// the possibility of false negatives since they are not sampled synchronously.
+//
+// Unordered and relaxed atomics do not implicitly impose any memory barriers, so
+// the compiler may cache the statistics and thread metadata excessively, which
+// makes this asynchronous atomic sampling worse.
+// To counter that, we can elect to explicitly issue compiler memory barriers to
+// ensure metadata visibility across threads. This is optional, and only useful if
+// the compiler does aggressive inlining.
+//
+inline void metadata_memory_barrier_if_needed() {
+ // Issue the barrier only when the thread metadata type (thread_atomic) is an
+ // unordered_atomic or a relaxed_atomic; any other choice compiles to nothing here.
+ if constexpr (std::is_same_v<thread_atomic<int32_t>, unordered_atomic<int32_t>>
+ || std::is_same_v<thread_atomic<int32_t>, relaxed_atomic<int32_t>>) {
+ compiler_memory_barrier();
+ }
+}
+
/**
* Helper method to accumulate floating point values to an atomic
* prior to C++23 support of atomic<float> atomic<double> accumulation.
@@ -411,6 +537,30 @@ void atomic_add_to(std::atomic<AccumulateType> &dst, ValueType src,
dst.fetch_add(src, order);
}
+template <typename AccumulateType, typename ValueType>
+requires std::is_floating_point<AccumulateType>::value  // floating point: add via a CAS loop
+void atomic_add_to(relaxed_atomic<AccumulateType> &dst, ValueType src,
+ std::memory_order order = std::memory_order_relaxed) {
+ AccumulateType expected;
+ do {  // retry until the compare-exchange installs expected + src
+ expected = dst;  // relaxed load through relaxed_atomic::operator T()
+ } while (!dst.compare_exchange_weak(expected, expected + src, order));
+}
+
+template <typename AccumulateType, typename ValueType>
+requires std::is_integral<AccumulateType>::value  // integral: a single fetch_add suffices
+void atomic_add_to(relaxed_atomic<AccumulateType> &dst, ValueType src,
+ std::memory_order order = std::memory_order_relaxed) {
+ dst.fetch_add(src, order);  // src converts to AccumulateType; the previous value is discarded
+}
+
+template <typename AccumulateType, typename ValueType>  // no constraint on AccumulateType
+void atomic_add_to(unordered_atomic<AccumulateType> &dst, ValueType src,
+ std::memory_order order = std::memory_order_relaxed) {
+ (void)order; // unused: unordered_atomic operations take no memory order
+ dst = dst + src;  // plain read, add, write; not an atomic read-modify-write
+}
+
/**
* mutex_stat is a struct composed of atomic members associated
* with usage of a particular mutex order.
@@ -427,11 +577,11 @@ struct mutex_stat {
static_assert(std::is_integral_v<CounterType>);
static_assert(std::atomic<CounterType>::is_always_lock_free);
static_assert(std::atomic<AccumulatorType>::is_always_lock_free);
- std::atomic<CounterType> locks = 0; // number of times locked
- std::atomic<CounterType> unlocks = 0; // number of times unlocked
- std::atomic<CounterType> waits = 0; // number of locks that waitedwa
- std::atomic<AccumulatorType> wait_sum_ns = 0.; // sum of time waited.
- std::atomic<AccumulatorType> wait_sumsq_ns = 0.; // sumsq of time waited.
+ stats_atomic<CounterType> locks = 0; // number of times locked
+ stats_atomic<CounterType> unlocks = 0; // number of times unlocked
+ stats_atomic<CounterType> waits = 0; // number of locks that waited
+ stats_atomic<AccumulatorType> wait_sum_ns = 0.; // sum of time waited.
+ stats_atomic<AccumulatorType> wait_sumsq_ns = 0.; // sumsq of time waited.
template <typename WaitTimeType>
void add_wait_time(WaitTimeType wait_ns) {
@@ -524,7 +674,7 @@ struct mutex_stat {
template <typename Item, typename Payload, size_t N>
class atomic_stack {
public:
- using item_payload_pair_t = std::pair<std::atomic<Item>, std::atomic<Payload>>;
+ using item_payload_pair_t = std::pair<thread_atomic<Item>, thread_atomic<Payload>>;
/**
* Puts the item at the top of the stack.
@@ -648,8 +798,8 @@ public:
const auto& invalid() const { return invalid_; }
private:
- std::atomic<size_t> top_ = 0; // ranges from 0 to N - 1
- std::atomic<size_t> true_top_ = 0; // always >= top_.
+ thread_atomic<size_t> top_ = 0; // ranges from 0 to N - 1
+ thread_atomic<size_t> true_top_ = 0; // always >= top_.
// if true_top_ == top_ the subset stack is complete.
/*
@@ -761,7 +911,7 @@ public:
}
const pid_t tid_; // me
- std::atomic<MutexHandle> mutex_wait_{}; // mutex waiting for
+ thread_atomic<MutexHandle> mutex_wait_{}; // mutex waiting for
atomic_stack_t mutexes_held_; // mutexes held
};
@@ -1080,11 +1230,13 @@ public:
m_.lock();
}
lock_scoped_stat_t::post_lock(*this);
+ metadata_memory_barrier_if_needed();
}
void unlock() RELEASE() {
lock_scoped_stat_t::pre_unlock(*this);
m_.unlock();
+ metadata_memory_barrier_if_needed();  // optional compiler-only barrier after the unlock
}
bool try_lock(int64_t timeout_ns = 0) TRY_ACQUIRE(true) {
@@ -1100,10 +1252,12 @@ public:
lock_scoped_stat_t ls(*this);
if (pthread_mutex_timedlock(m_.native_handle(), &ts) != 0) {
ls.ignoreWaitTime(); // didn't get lock, don't count wait time
+ metadata_memory_barrier_if_needed();
return false;
}
}
lock_scoped_stat_t::post_lock(*this);
+ metadata_memory_barrier_if_needed();
return true;
}
@@ -1376,12 +1530,14 @@ public:
}
mutex::lock_scoped_stat_t::post_lock(mutex_);
held = true;
+ metadata_memory_barrier_if_needed();
}
void unlock() RELEASE() {
mutex::lock_scoped_stat_t::pre_unlock(mutex_);
held = false;
ul_.unlock();
+ metadata_memory_barrier_if_needed();  // optional compiler-only barrier after the unlock
}
bool try_lock() TRY_ACQUIRE(true) {
@@ -1389,6 +1545,7 @@ public:
if (!ul_.try_lock()) return false;
mutex::lock_scoped_stat_t::post_lock(mutex_);
held = true;
+ metadata_memory_barrier_if_needed();
return true;
}
@@ -1399,6 +1556,7 @@ public:
if (!ul_.try_lock_for(timeout_duration)) return false;
mutex::lock_scoped_stat_t::post_lock(mutex_);
held = true;
+ metadata_memory_barrier_if_needed();
return true;
}
@@ -1409,6 +1567,7 @@ public:
if (!ul_.try_lock_until(timeout_time)) return false;
mutex::lock_scoped_stat_t::post_lock(mutex_);
held = true;
+ metadata_memory_barrier_if_needed();
return true;
}