1 #include<stdio.h>
2 #include<cuda_runtime.h>
3
// A __global__ function is launched from host (CPU) code and executed on the GPU.
//
// mul: in-place element-wise transform of an int array. Every element v in
// dev_a[0 .. NUM) is overwritten with ((v % 23) * v * 5) % 9.
//
//   dev_a - array of at least NUM ints, dereferenced inside the kernel
//   NUM   - number of elements to process
//
// Only the x dimension of the launch configuration is used. Each thread
// starts at its global x index and advances by the total number of threads
// in the grid (grid-stride loop), so any 1D grid/block size covers all NUM
// elements. No shared memory is used.
__global__ void mul(int *dev_a,const int NUM)
{
    // Total number of threads in the grid along x.
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < NUM; i += stride)
    {
        const int v = dev_a[i];
        // % and * share precedence and associate left to right, so the
        // unparenthesised form v%23*v*5%9 evaluates exactly like this.
        dev_a[i] = (((v % 23) * v) * 5) % 9;
    }
}
15
// Reports a failed CUDA runtime call on stderr using the runtime's own error
// text (perror would print the unrelated errno message). Returns true when
// err is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Fills a host array with 0..NUM-1, runs the mul kernel over it on the GPU,
// copies the result back and prints the kernel time measured with CUDA events.
//
// Returns 0 on success, 1 if any CUDA call fails. All CUDA resources are
// released on every path.
int main(void)
{
    const int thread_pre_block = 64;  // threads per block
    const int block_pre_grid = 8;     // blocks in the grid
    const int NUM = 45056;            // 8 * 64 = 512 threads, 88 elements each

    // Host buffer, initialised with its own indices.
    int host_a[NUM];
    for (int i = 0; i < NUM; i++)
        host_a[i] = i;

    // Everything that needs cleanup starts out null so the single cleanup
    // path below can tell what was actually created.
    int *dev_a = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float time_elapsed = 0;

    const dim3 threads(thread_pre_block);
    const dim3 blocks(block_pre_grid);

    // Each step runs only if all previous steps succeeded (&& short-circuits).
    bool ok = checkCuda(cudaMalloc((void **)&dev_a, sizeof(int) * NUM),
                        "cudaMalloc");

    // Upload the input data to the GPU.
    ok = ok && checkCuda(cudaMemcpy(dev_a, host_a, sizeof(host_a),
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy (host to device)");

    // Events used to time the kernel.
    ok = ok && checkCuda(cudaEventCreate(&start), "cudaEventCreate (start)");
    ok = ok && checkCuda(cudaEventCreate(&stop), "cudaEventCreate (stop)");

    ok = ok && checkCuda(cudaEventRecord(start, 0), "cudaEventRecord (start)");
    if (ok)
    {
        mul<<<blocks, threads, 0, 0>>>(dev_a, NUM);
        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "mul kernel launch");
    }
    ok = ok && checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord (stop)");

    // stop is recorded after start and the kernel on the same stream, so
    // waiting for stop alone is sufficient before reading the elapsed time.
    ok = ok && checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize (stop)");
    ok = ok && checkCuda(cudaEventElapsedTime(&time_elapsed, start, stop),
                         "cudaEventElapsedTime");

    // Copy the result back to the host.
    ok = ok && checkCuda(cudaMemcpy(host_a, dev_a, sizeof(host_a),
                                    cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)");

    // Cleanup, shared by the success and failure paths.
    if (start != NULL)
        cudaEventDestroy(start);
    if (stop != NULL)
        cudaEventDestroy(stop);
    cudaFree(dev_a);  // no-op when dev_a is still null

    if (!ok)
        return 1;

    printf("执行时间:%f(ms)\n", time_elapsed);
    return 0;
}