Moving Data
cudaMalloc(&d_data, size);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
kernel<<<blocks, threads>>>(d_data);
kernel2<<<blocks, threads>>>(d_data);
^ Is this faster? (The data stays on the device between the two kernel launches.)
cudaMalloc(&d_data, size);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
kernel<<<blocks, threads>>>(d_data);
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
kernel2<<<blocks, threads>>>(d_data);
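^ Is this faster? (The data is copied back to the host and then to the device again between the two kernel launches.)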
* For illustration purposes only; see the FAQ for more details.