Kernel Fission

// Element-wise stage: out[i] = heavy_compute(in[i]) for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Only the .x
// components of blockIdx/blockDim/threadIdx are used, so the grid must supply
// at least N threads along x (e.g. blocks = (N + threads - 1) / threads).
//
// Parameters:
//   out - destination array, at least N elements.
//   in  - source array, at least N elements.
//   N   - number of elements to process.
//
// heavy_compute is defined outside this excerpt.
__global__ void compute(float* out, float* in, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // The grid is normally rounded up to a whole number of blocks, so threads
  // past the end of the data must do nothing instead of touching memory
  // beyond N.
  if (idx < N) {
    out[idx] = heavy_compute(in[idx]);
  }
}
// Scatter stage: dst[map[i]] = src[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks, one thread per source element. Only
// the .x components of blockIdx/blockDim/threadIdx are used, so the grid must
// supply at least N threads along x.
//
// Parameters:
//   dst - destination array, indexed through map.
//   src - source array, at least N elements.
//   map - for each source index i, the destination index to write;
//         at least N elements. Every map[i] must be a valid index into dst.
//   N   - number of source elements to scatter.
//
// If map holds the same destination index more than once, several threads
// store to the same dst element and which value ends up there is unspecified.
__global__ void scatter(float* dst, float* src, int* map, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Tail threads of a rounded-up grid must not read src/map past N, nor
  // write a stray value into dst.
  if (idx < N) {
    dst[map[idx]] = src[idx];
  }
}
^ Is this version (two separate kernels) faster?
// Single-kernel version: dst[map[i]] = heavy_compute(in[i]) for every i in
// [0, N), with no intermediate array between the compute and the scatter.
//
// Launch layout: 1D grid of 1D blocks, one thread per input element. Only
// the .x components of blockIdx/blockDim/threadIdx are used, so the grid must
// supply at least N threads along x.
//
// Parameters:
//   dst - destination array, indexed through map.
//   in  - input array, at least N elements.
//   map - for each input index i, the destination index to write;
//         at least N elements. Every map[i] must be a valid index into dst.
//   N   - number of input elements to process.
//
// heavy_compute is defined outside this excerpt. If map holds the same
// destination index more than once, which result ends up in that dst element
// is unspecified.
__global__ void combined(float* dst, float* in, int* map, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Tail threads of a rounded-up grid must not read in/map past N, nor
  // write a stray value into dst.
  if (idx < N) {
    dst[map[idx]] = heavy_compute(in[idx]);
  }
}
^ Is this version (a single combined kernel) faster?

* For illustration purposes only; see the FAQ for more details.