Prefetch

// Distance, in elements, between consecutive hops of the index chain.
constexpr int STRIDE = 4096;

// Build the chain: every slot stores the index that lies STRIDE elements
// ahead of it, wrapping around at the end of the array.
for (auto i = 0u; i < arr.size(); ++i) {
  arr[i] = (i + STRIDE) % arr.size();
}

// Measure below part only
// NOTE(review): sum is a plain int; if arr is large the running total could
// exceed INT_MAX (signed overflow is undefined behaviour) - confirm against
// the real array size.
int sum{0};
int p{0};
for (auto i = 0u; i < arr.size(); ++i) {
  // With the chain built above, the next two hops are p + STRIDE and
  // p + 2 * STRIDE (modulo the array size). Hint both before the dependent
  // load below. Arguments: rw = 0 (read), locality = 0 (no temporal locality).
  __builtin_prefetch(&arr[(p + 1 * STRIDE) % arr.size()], 0, 0);
  __builtin_prefetch(&arr[(p + 2 * STRIDE) % arr.size()], 0, 0);

  // Each step depends on the value just loaded: accumulate it, then follow it.
  sum += arr[p];
  p = arr[p];
}
// Variant without explicit prefetch hints.
// Distance, in elements, between consecutive hops of the index chain.
constexpr int STRIDE = 4096;

// Build the chain: every slot stores the index that lies STRIDE elements
// ahead of it, wrapping around at the end of the array.
for (auto i = 0u; i < arr.size(); ++i) {
  arr[i] = (i + STRIDE) % arr.size();
}

// Measure below part only
// NOTE(review): sum is a plain int; if arr is large the running total could
// exceed INT_MAX (signed overflow is undefined behaviour) - confirm against
// the real array size.
int sum{0};
int p{0};
for (auto i = 0u; i < arr.size(); ++i) {
  // Each step depends on the value just loaded: accumulate it, then follow it.
  sum += arr[p];
  p = arr[p];
}