// CUDA parallel addition example: vector addition

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <stdio.h>

// Forward declaration of the host-side helper that adds two int vectors on the GPU.
// It is declared here because main() calls it before the helper's body appears.
//   c    - host output array that receives the element-wise sums (size elements)
//   a, b - host input arrays (size elements each)
//   size - number of elements in each of the three arrays
// Returns the cudaError_t status of the operation; cudaSuccess means c was filled in.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size);

/**
 * Element-wise vector addition kernel: each thread computes c[i] = a[i] + b[i].
 *
 * Launch layout: the element index is taken from threadIdx.x alone, so the
 * kernel expects a single block with one thread per element (this file
 * launches it as <<<1, size>>>). Additional blocks would only repeat the same
 * indices. There is no bounds check, so the number of threads per block must
 * not exceed the length of the arrays. No shared memory is used.
 *
 * @param c  output array that receives the sums
 * @param a  first input array
 * @param b  second input array
 */
__global__ void addKernel(int* c, const int* a, const int* b)
{
    // One thread per element; the thread's index within the block selects the element.
    const int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

/**
 * Program entry point: adds two fixed 5-element vectors on the GPU via
 * addWithCuda, prints the result, and resets the device before exiting.
 *
 * Returns 0 on success, or 1 if addWithCuda or cudaDeviceReset reports an error.
 */
int main()
{
    const int kCount = 5;
    const int lhs[kCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kCount] = { 10, 20, 30, 40, 50 };
    int sums[kCount] = { 0 };

    // Add the vectors in parallel.
    if (addWithCuda(sums, lhs, rhs, kCount) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        sums[0], sums[1], sums[2], sums[3], sums[4]);

    // cudaDeviceReset must be called before exiting so that profiling and
    // tracing tools such as Nsight and Visual Profiler show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

// Helper function that uses CUDA to add vectors in parallel.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size) {
int* dev_a = 0;
int* dev_b = 0;
int* dev_c = 0;
cudaError_t cudaStatus;

// 选择要运行的 GPU，在多 GPU 系统上更改此设置。
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
    goto Error;
}

// 为三个向量（两个输入，一个输出）分配 GPU 缓冲区。
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!\n");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!\n");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!\n");
    goto Error;
}

// 将输入向量从主机内存复制到 GPU 缓冲区。
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!\n");
    goto Error;
}

cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!\n");
    goto Error;
}

// 在 GPU 上启动一个内核，每个元素使用一个线程。
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

// 检查启动内核时出现的任何错误。
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize 等待内核完成，并返回启动过程中遇到的任何错误。
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// 将输出向量从 GPU 缓冲区复制回主机内存。
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!\n");
    goto Error;
}

Error: cudaFree(dev_c); cudaFree(dev_a); cudaFree(dev_b);

return cudaStatus;