使用 CUDA 的矩阵相乘并行化代码(使用共享存储器优化)
以下是使用CUDA进行矩阵乘法并使用共享存储器进行优化的代码。在这个实现中,我们首先将输入矩阵从全局内存中复制到共享内存中,然后使用线程块中的线程计算结果矩阵的一部分。最后,我们将结果从共享内存写回全局内存。
#include <stdio.h>
#include <stdlib.h>
#define TILE_WIDTH 16
#define BLOCK_SIZE 16
// Tiled matrix multiply C = A * B for square n x n row-major matrices,
// staging TILE_WIDTH x TILE_WIDTH tiles of A and B through shared memory.
//
// Launch config expected: blockDim = (TILE_WIDTH, TILE_WIDTH),
// gridDim = (ceil(n/TILE_WIDTH), ceil(n/TILE_WIDTH)).
// Works for any n >= 0 (out-of-range tile elements are zero-filled, and
// the final store is guarded), unlike a version that assumes
// n % TILE_WIDTH == 0.
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int n)
{
__shared__ float s_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float s_B[TILE_WIDTH][TILE_WIDTH];
int tx = threadIdx.x;
int ty = threadIdx.y;
// Row/Col: the output element this thread is responsible for.
int Row = blockIdx.y * TILE_WIDTH + ty;
int Col = blockIdx.x * TILE_WIDTH + tx;
float Cvalue = 0.0f;
// Ceil-division so the tail tile is processed when n is not a
// multiple of TILE_WIDTH.
int numTiles = (n + TILE_WIDTH - 1) / TILE_WIDTH;
for (int t = 0; t < numTiles; t++) {
    int aCol = t * TILE_WIDTH + tx;  // column of A this thread loads
    int bRow = t * TILE_WIDTH + ty;  // row of B this thread loads
    // Zero-fill out-of-range elements: zeros contribute nothing to the
    // dot product, so the tail tile computes the correct partial sum.
    s_A[ty][tx] = (Row < n && aCol < n) ? A[Row * n + aCol] : 0.0f;
    s_B[ty][tx] = (bRow < n && Col < n) ? B[bRow * n + Col] : 0.0f;
    // Barrier reached by every thread in the block (guards above are
    // per-element ternaries, not divergent control flow around it).
    __syncthreads();
#pragma unroll
    for (int i = 0; i < TILE_WIDTH; i++) {
        Cvalue += s_A[ty][i] * s_B[i][tx];
    }
    // Prevent the next iteration from overwriting tiles still being read.
    __syncthreads();
}
// Guard the store: threads past the matrix edge computed a (zero-padded)
// value but must not write outside C.
if (Row < n && Col < n) {
    C[Row * n + Col] = Cvalue;
}
}
// Check every CUDA runtime call; kernel launch errors are caught separately
// via cudaGetLastError() + cudaDeviceSynchronize() after the launch.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return -1;                                                    \
        }                                                                 \
    } while (0)

// Host driver: allocate n x n matrices, run the shared-memory matmul on the
// GPU, and verify the result against a closed-form expected value.
// With A and B filled with 1.0f every element of C is exactly (float)n
// (n = 1024 is exactly representable, and summing 1024 ones in float is
// exact), so a bitwise comparison is valid.
int main()
{
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;
    int n = 1024;
    size_t size = (size_t)n * n * sizeof(float);

    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return -1;
    }

    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    // All-ones inputs give a trivially verifiable product: C[i][j] = n.
    for (int i = 0; i < n * n; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }

    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Ceil-division so the grid covers the whole matrix even when n is
    // not a multiple of TILE_WIDTH (the kernel guards the tail).
    dim3 dimGrid((n + TILE_WIDTH - 1) / TILE_WIDTH,
                 (n + TILE_WIDTH - 1) / TILE_WIDTH);
    matrixMultiplyShared<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // asynchronous execution errors

    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    float expected = (float)n;
    for (int i = 0; i < n * n; i++) {
        if (h_C[i] != expected) {
            printf("Error at position %d: got %f, expected %f\n",
                   i, h_C[i], expected);
            return -1;
        }
    }
    printf("Success!\n");

    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}
```
原文地址: https://www.cveoy.top/t/topic/cvfw 著作权归作者所有。请勿转载和采集!