Store-To-Load Forwarding

/// Two adjacent 32-bit fields that can also be read back as one 64-bit value.
///
/// Store() writes the two halves with two separate 32-bit assignments, while
/// Load() reads the whole object with a single 64-bit access, so the load
/// spans two earlier, narrower stores.
///
/// The struct is aligned like an int64_t so that the 64-bit access in Load()
/// is never misaligned (the two int32_t members alone only guarantee 4-byte
/// alignment).
struct alignas(int64_t) Elem {
  int32_t part1{0};
  int32_t part2{0};

  /// Sets part1 to 0 and part2 to x, as two independent 32-bit writes.
  void Store(int32_t x) {
    part1 = 0;
    part2 = x;
  }

  /// Returns the 8 bytes of this object reinterpreted as one int64_t.
  /// Which field lands in the low half depends on the target's byte order.
  int64_t Load() const {
    // NOTE(review): reading the members through an int64_t lvalue is type
    // punning; the cast is kept so the 64-bit access width stays explicit.
    // std::memcpy into a local int64_t is the strictly conforming equivalent.
    return *reinterpret_cast<const int64_t*>(this);
  }
};

static_assert(sizeof(Elem) == sizeof(int64_t),
              "Elem must be exactly 64 bits wide for Load()");

// Source value for every store. Because it is volatile, each access in the
// loop below is a real read of x rather than a value cached by the compiler.
volatile uint16_t x;

// For every element: store, full fence, then load and accumulate into sum.
// Elem::Store here writes the two 32-bit fields separately, and Elem::Load
// reads all 64 bits in one access.
for (auto &elem : arr) {
  // x (uint16_t) is implicitly widened to Store's int32_t parameter.
  elem.Store(x);
  // Sequentially consistent fence between the store and the following load.
  // NOTE(review): presumably here so the compiler cannot fold the store and
  // the load together -- confirm against the full benchmark.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  sum += elem.Load();
}

/// Two adjacent 32-bit fields that are written and read as one 64-bit value.
///
/// Both Store() and Load() touch the whole object with a single 64-bit
/// access, so the load has exactly the same address and width as the store
/// that precedes it.
///
/// The struct is aligned like an int64_t so that those 64-bit accesses are
/// never misaligned (the two int32_t members alone only guarantee 4-byte
/// alignment).
struct alignas(int64_t) Elem {
  int32_t part1{0};
  int32_t part2{0};

  /// Overwrites all 8 bytes of this object with x sign-extended to 64 bits,
  /// using one 64-bit write.
  void Store(int32_t x) {
    // NOTE(review): writing the members through an int64_t lvalue is type
    // punning; the cast is kept so the 64-bit access width stays explicit.
    // std::memcpy from a local int64_t is the strictly conforming equivalent.
    *reinterpret_cast<int64_t*>(this) = x;
  }

  /// Returns the 8 bytes of this object reinterpreted as one int64_t; after
  /// Store(x) this is x widened to 64 bits.
  int64_t Load() const {
    return *reinterpret_cast<const int64_t*>(this);
  }
};

static_assert(sizeof(Elem) == sizeof(int64_t),
              "Elem must be exactly 64 bits wide for Store()/Load()");

// Source value for every store. Because it is volatile, each access in the
// loop below is a real read of x rather than a value cached by the compiler.
volatile uint16_t x;

// For every element: store, full fence, then load and accumulate into sum.
// Elem::Store here writes the whole object as one int64_t, and Elem::Load
// reads it back with an access of the same width.
for (auto &elem : arr) {
  // x (uint16_t) is implicitly widened to Store's int32_t parameter.
  elem.Store(x);
  // Sequentially consistent fence between the store and the following load.
  // NOTE(review): presumably here so the compiler cannot fold the store and
  // the load together -- confirm against the full benchmark.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  sum += elem.Load();
}

* The benchmark was run on an AMD Ryzen 9.

* For the full benchmark code, please refer here.

* For illustration purposes only; see the FAQ for more details.