NVCC is NVIDIA's compiler driver for CUDA C/C++ source code; it allows developers to write high-performance GPU-accelerated applications by leveraging the power of NVIDIA GPUs for parallel processing tasks.
Today I tested some simple examples with this tool on Google Colab using the nvcc4jupyter Python package.
You need to install it with pip and know how to write CUDA C/C++ source code, or you can start from the basic example in the documentation.
pip install nvcc4jupyter
I changed some of the source code because the original example requires an extra library to be installed, and I did not have time to learn and test it.
But this will allow me to test better, because I don't have good hardware on my desktop.
This is the source code I changed; I removed the parts that depend on error_handling.h.
This is the changed source code; you can see more in my GitHub repo for Google Colab!
#include <cstdio>
#include <cstdlib>
//#include "error_handling.h"
// Number of float elements in each of the vectors A, B and C.
constexpr int DSIZE = 4096;
// Threads per block used when launching the vector-add kernel.
constexpr int block_size = 256;
// Element-wise vector addition kernel: C[i] = A[i] + B[i] for 0 <= i < ds.
//
// Each thread handles at most one element, selected by its flat index built
// from the x components of blockIdx/blockDim/threadIdx (only the x dimension
// of the launch is used). Threads whose index falls at or beyond ds do nothing,
// so the launch may contain more threads than there are elements.
//
// A, B : input vectors, read only
// C    : output vector, written
// ds   : number of elements to process
__global__ void vadd(const float *A, const float *B, float *C, int ds){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail: surplus threads in the last block exit immediately.
    if (gid >= ds) {
        return;
    }
    C[gid] = A[gid] + B[gid];
}
// Evaluate a CUDA runtime call; if it did not return cudaSuccess, print the
// caller-supplied context message plus the runtime's error string and the
// source location to stderr, then terminate with a failure exit code.
// Stands in for the cudaCheckErrors helper of the removed error_handling.h.
#define CUDA_CHECK(call, msg)                                              \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", (msg),      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host driver for the vector-add example.
//
// Fills two host vectors of DSIZE floats with pseudo-random values in [0, 1],
// copies them to the device, launches vadd to compute C = A + B, copies the
// result back and prints the first element of each vector.
//
// Every CUDA runtime call is checked, so a failing allocation, copy or kernel
// launch stops the program with a diagnostic instead of silently printing a
// stale result. All host and device buffers are released before returning.
//
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main(){
    // Size in bytes of one vector; shared by every allocation and copy below.
    const size_t bytes = DSIZE * sizeof(float);

    float *h_A = nullptr, *h_B = nullptr, *h_C = nullptr;
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    // allocate space for vectors in host memory
    h_A = new float[DSIZE];
    h_B = new float[DSIZE];
    h_C = new float[DSIZE];

    // initialize vectors in host memory to random values (except for the
    // result vector whose values do not matter as they will be overwritten)
    for (int i = 0; i < DSIZE; i++) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // allocate space for vectors in device memory
    CUDA_CHECK(cudaMalloc(&d_A, bytes), "cudaMalloc failure");
    CUDA_CHECK(cudaMalloc(&d_B, bytes), "cudaMalloc failure");
    CUDA_CHECK(cudaMalloc(&d_C, bytes), "cudaMalloc failure");

    // copy vectors A and B from host to device:
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy H2D failure");
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy H2D failure");

    // launch the vector adding kernel; the block count is rounded up so that
    // every element is covered even when DSIZE is not a multiple of block_size
    vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
    // a kernel launch returns nothing itself, so query the launch status here
    CUDA_CHECK(cudaGetLastError(), "kernel launch failure");

    // wait for the kernel to finish execution; errors raised while the kernel
    // runs are reported by this call
    CUDA_CHECK(cudaDeviceSynchronize(), "kernel execution failure");

    // copy vector C from device to host:
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy D2H failure");

    printf("A[0] = %f\n", h_A[0]);
    printf("B[0] = %f\n", h_B[0]);
    printf("C[0] = %f\n", h_C[0]);

    // release device buffers
    CUDA_CHECK(cudaFree(d_A), "cudaFree failure");
    CUDA_CHECK(cudaFree(d_B), "cudaFree failure");
    CUDA_CHECK(cudaFree(d_C), "cudaFree failure");

    // release host buffers
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    return 0;
}
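This is the result. Note that C[0] should equal A[0] + B[0] (about 1.234571); the 0.000000 printed below most likely means that one of the CUDA calls failed silently, which the removed error checks would have reported.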
A[0] = 0.840188
B[0] = 0.394383
C[0] = 0.000000