Kernel Fusion
// Fused kernel: applies op1 (x * 2 + 1) and op2 (sqrtf) to each element in a
// single pass, so every element is loaded from `data` once and stored once.
//
// Launch: indexing uses only the .x components, so a 1D grid of 1D blocks is
// expected, with at least N threads in total (e.g. ceil(N / blockDim.x) blocks).
//
// data: array updated in place; presumably a device pointer to at least N
//       floats -- confirm against the caller.
// N:    number of elements to process; threads with idx >= N do nothing.
//
// Note: sqrtf yields NaN for elements where x * 2 + 1 is negative.
__global__ void fused(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // The grid rarely divides N evenly; without this guard the surplus
    // threads of the last block would read and write past the end of data.
    if (idx < N) {
        float x = data[idx];
        x = x * 2.0f + 1.0f; // op1
        x = sqrtf(x); // op2
        data[idx] = x;
    }
}
^ Is this faster?
// Unfused step 1: replaces each element x of `data` with x * 2 + 1, in place.
//
// Launch: indexing uses only the .x components, so a 1D grid of 1D blocks is
// expected, with at least N threads in total (e.g. ceil(N / blockDim.x) blocks).
//
// data: array updated in place; presumably a device pointer to at least N
//       floats -- confirm against the caller.
// N:    number of elements to process; threads with idx >= N do nothing.
__global__ void op1(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail: surplus threads in the last block must not touch
    // memory past the end of data.
    if (idx < N) {
        data[idx] = data[idx] * 2.0f + 1.0f;
    }
}
// Unfused step 2: replaces each element x of `data` with sqrtf(x), in place.
//
// Launch: indexing uses only the .x components, so a 1D grid of 1D blocks is
// expected, with at least N threads in total (e.g. ceil(N / blockDim.x) blocks).
//
// data: array updated in place; presumably a device pointer to at least N
//       floats -- confirm against the caller.
// N:    number of elements to process; threads with idx >= N do nothing.
//
// Note: sqrtf yields NaN for negative elements.
__global__ void op2(float* data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail: surplus threads in the last block must not touch
    // memory past the end of data.
    if (idx < N) {
        data[idx] = sqrtf(data[idx]);
    }
}
^ Is this faster?
* For illustration purposes only; see the FAQ for more details.