Mutex

// Sums `arr` with four worker threads. Each worker accumulates its quarter of
// the array into a thread-local `partial_sum` and publishes it to the shared
// atomic `sum` exactly once, so only four atomic updates happen in total.
// NOTE: the workers must be joined before `sum` is read (and before `threads`
// is destroyed); that step is not shown in this snippet.
std::vector<std::thread> threads;
std::atomic<int> sum{0};  // direct-init: also valid before C++17

for (int i = 0; i < 4; ++i) {
  threads.emplace_back([&arr, i, &sum]() {
    int partial_sum = 0;
    // Half-open range [start, end) covering the i-th quarter of `arr`.
    const auto start = i * arr.size() / 4;
    const auto end = (i + 1) * arr.size() / 4;
    // The index takes the type of `start`/`end` and its own name, so it
    // neither shadows the captured thread index `i` nor narrows to `int`.
    for (auto idx = start; idx < end; ++idx) {
      partial_sum += arr[idx];
    }

    // Single atomic read-modify-write per worker.
    sum += partial_sum;
  });
}
// Sums `arr` with four worker threads, each handling one quarter of the array.
// Every element is added directly to the shared atomic `sum`, so each addition
// is an atomic read-modify-write on a variable all four workers update.
// NOTE: the workers must be joined before `sum` is read (and before `threads`
// is destroyed); that step is not shown in this snippet.
std::vector<std::thread> threads;
std::atomic<int> sum{0};  // direct-init: also valid before C++17

for (int i = 0; i < 4; ++i) {
  threads.emplace_back([&arr, i, &sum]() {
    // Half-open range [start, end) covering the i-th quarter of `arr`.
    const auto start = i * arr.size() / 4;
    const auto end = (i + 1) * arr.size() / 4;
    // The index takes the type of `start`/`end` and its own name, so it
    // neither shadows the captured thread index `i` nor narrows to `int`.
    for (auto idx = start; idx < end; ++idx) {
      sum += arr[idx];
    }
  });
}

* The benchmark was run on an AMD Ryzen 9.

* The full benchmark code is available here.