Kernel Fusion

// Fused element-wise kernel: applies x -> sqrtf(x * 2.0f + 1.0f) to data[0..N)
// in place, reading and writing each element exactly once.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
// supply at least N threads, e.g. blocks = (N + threads - 1) / threads.
// Threads whose index falls at or beyond N do nothing, so N does not have to
// be a multiple of the block size.
//
// data: buffer of at least N floats, dereferenced by the kernel.
// N:    number of elements to process.
//
// Note: sqrtf yields NaN for elements where x * 2.0f + 1.0f is negative.
__global__ void fused(float* data, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the grid tail: without this, surplus threads in the last block
  // would read and write past the end of the buffer.
  if (idx < N) {
    float x = data[idx];
    x = x * 2.0f + 1.0f;   // op1
    x = sqrtf(x);           // op2
    data[idx] = x;
  }
}
^ Is this (one fused kernel) faster?
// Unfused stage 1: applies x -> x * 2.0f + 1.0f to data[0..N) in place.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
// supply at least N threads, e.g. blocks = (N + threads - 1) / threads.
// Threads whose index falls at or beyond N do nothing.
//
// data: buffer of at least N floats, dereferenced by the kernel.
// N:    number of elements to process.
__global__ void op1(float* data, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the grid tail so surplus threads never touch memory past data[N-1].
  if (idx < N) {
    data[idx] = data[idx] * 2.0f + 1.0f;
  }
}
// Unfused stage 2: applies x -> sqrtf(x) to data[0..N) in place.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
// supply at least N threads, e.g. blocks = (N + threads - 1) / threads.
// Threads whose index falls at or beyond N do nothing.
//
// data: buffer of at least N floats, dereferenced by the kernel.
// N:    number of elements to process.
//
// Note: sqrtf yields NaN for negative elements.
__global__ void op2(float* data, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the grid tail so surplus threads never touch memory past data[N-1].
  if (idx < N) {
    data[idx] = sqrtf(data[idx]);
  }
}
^ Or is this (two separate kernels, op1 then op2) faster?

* For illustration purposes only; see the FAQ for more details.