Store-To-Load Forwarding
#include <cstring>  // std::memcpy

/// Eight-byte element that is written as two 32-bit halves and read back as
/// one 64-bit value.
///
/// Store() performs two separate 32-bit member writes, while Load() reads all
/// eight bytes of the object at once, so the wide read spans both of the
/// earlier narrow writes.
struct Elem {
  int32_t part1{0};
  int32_t part2{0};

  /// Zeroes `part1` and writes `x` to `part2`, as two independent 32-bit
  /// assignments.
  void Store(int32_t x) {
    part1 = 0;
    part2 = x;
  }

  /// Returns the object representation of both halves as a single int64_t.
  ///
  /// The bytes are copied with std::memcpy instead of dereferencing
  /// `reinterpret_cast<int64_t*>(this)`: Elem contains no int64_t object and
  /// is only required to have int32_t alignment, so the cast-and-dereference
  /// form is undefined behaviour (strict aliasing and alignment). Optimizing
  /// compilers normally lower a fixed-size 8-byte memcpy to one 64-bit load.
  int64_t Load() const {
    int64_t wide;
    std::memcpy(&wide, this, sizeof(wide));
    return wide;
  }
};

// Load() copies sizeof(int64_t) bytes out of the object, so the two halves
// must fill exactly that many bytes with no padding.
static_assert(sizeof(Elem) == sizeof(int64_t),
              "Elem must be exactly as wide as int64_t");
// Benchmark kernel: for every element of `arr`, write it with Elem::Store,
// then read it back with Elem::Load and add the result to `sum`.
// `arr` and `sum` are declared outside this snippet.
//
// `x` is volatile, so it is re-read from memory on every iteration rather
// than being kept in a register or folded to a constant.
// NOTE(review): `x` has no initializer here — if it has automatic storage
// duration these reads are of an indeterminate value; confirm it is defined
// at namespace scope (zero-initialized) in the full benchmark.
volatile uint16_t x;
for (auto &elem : arr) {
// In this variant Store() issues two separate 32-bit member writes
// (part1 and part2).
elem.Store(x);
// Sequentially consistent fence placed between the write and the read-back.
// NOTE(review): presumably here so the compiler cannot fold the store into
// the following load — confirm against the full benchmark.
std::atomic_thread_fence(std::memory_order_seq_cst);
// Load() reads the whole element as a single 64-bit value, i.e. one wide
// read covering both of the narrow writes above.
sum += elem.Load();
}
#include <cstring>  // std::memcpy

/// Eight-byte element that is both written and read as one 64-bit value.
///
/// Store() writes all eight bytes of the object in a single 64-bit copy and
/// Load() reads them back the same way, so the write and the read have the
/// same width and address.
struct Elem {
  int32_t part1{0};
  int32_t part2{0};

  /// Widens `x` to int64_t (sign-extending) and writes it over the whole
  /// object.
  ///
  /// The bytes are copied with std::memcpy instead of assigning through
  /// `reinterpret_cast<int64_t*>(this)`: Elem contains no int64_t object and
  /// is only required to have int32_t alignment, so the cast-and-assign form
  /// is undefined behaviour (strict aliasing and alignment). Optimizing
  /// compilers normally lower a fixed-size 8-byte memcpy to one 64-bit store.
  void Store(int32_t x) {
    const int64_t wide = x;
    // The void* cast makes the raw byte overwrite explicit and avoids
    // class-memaccess style diagnostics.
    std::memcpy(static_cast<void*>(this), &wide, sizeof(wide));
  }

  /// Returns the object representation of both halves as a single int64_t,
  /// read with std::memcpy for the same reason as in Store().
  int64_t Load() const {
    int64_t wide;
    std::memcpy(&wide, this, sizeof(wide));
    return wide;
  }
};

// Store() and Load() copy sizeof(int64_t) bytes, so the two halves must fill
// exactly that many bytes with no padding.
static_assert(sizeof(Elem) == sizeof(int64_t),
              "Elem must be exactly as wide as int64_t");
// Benchmark kernel: for every element of `arr`, write it with Elem::Store,
// then read it back with Elem::Load and add the result to `sum`.
// `arr` and `sum` are declared outside this snippet.
//
// `x` is volatile, so it is re-read from memory on every iteration rather
// than being kept in a register or folded to a constant.
// NOTE(review): `x` has no initializer here — if it has automatic storage
// duration these reads are of an indeterminate value; confirm it is defined
// at namespace scope (zero-initialized) in the full benchmark.
volatile uint16_t x;
for (auto &elem : arr) {
// In this variant Store() writes the whole element as one 64-bit value.
elem.Store(x);
// Sequentially consistent fence placed between the write and the read-back.
// NOTE(review): presumably here so the compiler cannot fold the store into
// the following load — confirm against the full benchmark.
std::atomic_thread_fence(std::memory_order_seq_cst);
// Load() reads the element back as one 64-bit value, the same width as the
// write above.
sum += elem.Load();
}
* The benchmark was run on an AMD Ryzen 9 processor.
* For the full benchmark code, please refer here.