Kernel Fission
// Element-wise kernel: out[i] = heavy_compute(in[i]) for every i in [0, N).
//
// Parameters:
//   out - destination array; element i is written by the thread whose global
//         index is i
//   in  - source array; element i is read by that same thread
//   N   - number of elements to process
//
// Uses only the x dimension of the launch configuration, one element per
// thread. Elements at indices >= gridDim.x * blockDim.x are not processed, so
// launch with at least N threads in total, e.g. ceil(N / blockDim.x) blocks.
// heavy_compute is defined outside this block.
__global__ void compute(float* out, float* in, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // The grid is usually rounded up to a whole number of blocks, so threads
    // past the end of the data must not touch memory.
    if (idx < N) {
        out[idx] = heavy_compute(in[idx]);
    }
}
// Scatter kernel: dst[map[i]] = src[i] for every i in [0, N).
//
// Parameters:
//   dst - destination array, written at the positions given by map
//   src - source array; element i is read by the thread whose global index is i
//   map - for each source index i, the destination index to write to
//   N   - number of source elements (and of map entries) to process
//
// Uses only the x dimension of the launch configuration, one element per
// thread; launch with at least N threads in total. The values in map are not
// validated here: each map[i] must be a valid index into dst, and if two
// entries are equal, several threads store to the same dst element without any
// ordering between them.
__global__ void scatter(float* dst, float* src, int* map, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail of the grid: an out-of-range idx would read a garbage
    // map entry and then write to an arbitrary location in dst.
    if (idx < N) {
        dst[map[idx]] = src[idx];
    }
}
^ Is this version (two separate kernels) faster?
// Single-kernel compute + scatter: dst[map[i]] = heavy_compute(in[i]) for
// every i in [0, N). The result of heavy_compute is stored straight to its
// mapped position in dst, with no intermediate array.
//
// Parameters:
//   dst - destination array, written at the positions given by map
//   in  - source array; element i is read by the thread whose global index is i
//   map - for each source index i, the destination index to write to
//   N   - number of source elements (and of map entries) to process
//
// Uses only the x dimension of the launch configuration, one element per
// thread; launch with at least N threads in total. The values in map are not
// validated here: each map[i] must be a valid index into dst, and if two
// entries are equal, several threads store to the same dst element without any
// ordering between them. heavy_compute is defined outside this block.
__global__ void combined(float* dst, float* in, int* map, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail of the grid: an out-of-range idx would read past the end
    // of in and map, then write to an arbitrary location in dst.
    if (idx < N) {
        dst[map[idx]] = heavy_compute(in[idx]);
    }
}
^ Is this version (one combined kernel) faster?
* For illustration purposes only; see the FAQ for more details.