// CUDA parallel addition example: vector addition
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
// Forward declaration so that main() can call addWithCuda before its
// definition, which appears after main() in this file.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size);
/**
 * Element-wise integer vector addition kernel: c[i] = a[i] + b[i].
 *
 * Launch layout: the element index is taken from threadIdx.x alone, so one
 * block of N threads processes elements 0..N-1 (the caller in this file
 * launches exactly one block with one thread per element). blockIdx is not
 * used, so additional blocks would only repeat the same elements.
 *
 * Preconditions: there is no bounds check, so every launched thread reads
 * a[i] and b[i] and writes c[i]; all three buffers must therefore hold at
 * least blockDim.x elements. No shared memory is used.
 *
 * @param c  output buffer, receives the sums
 * @param a  first input buffer
 * @param b  second input buffer
 */
__global__ void addKernel(int* c, const int* a, const int* b)
{
    // One thread handles exactly one element.
    const int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
/**
 * Program entry point: adds two fixed 5-element integer vectors through
 * addWithCuda and prints the element-wise sums.
 *
 * @return 0 on success; 1 if addWithCuda or cudaDeviceReset reports an error.
 */
int main()
{
    const int kCount = 5;
    const int lhs[kCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kCount] = { 10, 20, 30, 40, 50 };
    int sum[kCount] = { 0 };

    // Add the two vectors in parallel.
    const cudaError_t addStatus = addWithCuda(sum, lhs, rhs, kCount);
    if (cudaSuccess != addStatus)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           sum[0], sum[1], sum[2], sum[3], sum[4]);

    // cudaDeviceReset must be called before exiting so that profiling and
    // tracing tools such as Nsight and Visual Profiler show complete traces.
    if (cudaSuccess != cudaDeviceReset())
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}
/**
 * Helper that adds two integer vectors on the GPU: c[i] = a[i] + b[i].
 *
 * Selects device 0, allocates three device buffers, copies both inputs to
 * the device, launches addKernel with a single block of `size` threads,
 * waits for it to finish and copies the result back into `c`. The device
 * buffers are released on every exit path.
 *
 * @param c     host output array with room for `size` ints
 * @param a     host input array of `size` ints
 * @param b     host input array of `size` ints
 * @param size  number of elements. 0 is treated as a successful no-op.
 *              Because the launch uses one block with one thread per
 *              element, `size` cannot exceed the device's per-block thread
 *              limit; beyond that the launch fails and the error is returned.
 * @return cudaSuccess on success, otherwise the status of the first failing
 *         CUDA call (a diagnostic is also written to stderr).
 */
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size)
{
    // Everything is declared before the first `goto Error` so that no jump
    // crosses an initialization.
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;
    const size_t bytes = size * sizeof(int);
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Nothing to add. A launch with zero threads per block is an invalid
    // configuration, so return early instead; cudaStatus is cudaSuccess here
    // and the cleanup below is a no-op for the still-null device pointers.
    if (size == 0) {
        goto Error;
    }

    // Allocate GPU buffers for the three vectors (two inputs, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the input vectors from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    // Launch the kernel on the GPU with one thread per element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // A kernel launch returns no status itself; launch errors are picked up here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the output vector from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    // Shared cleanup for success and failure; pointers that were never
    // allocated are still null.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
// Original source: https://www.cveoy.top/t/topic/l0QB  Copyright belongs to the author. Do not reproduce or scrape.