#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// Edge length of the square matrices; main() allocates N * N floats per matrix.
#define N 1024

/*
 * matrixMul - naive dense matrix product c = a * b for square, row-major
 * n x n matrices. Each thread computes at most one element of c.
 *
 * Parameters:
 *   a, b  input matrices, n * n floats each; only read by the kernel
 *   c     output matrix, n * n floats; every element is overwritten
 *   n     matrix dimension
 *
 * All three pointers are dereferenced on the device, so they must be
 * device-accessible.
 *
 * Launch layout: a 2D grid of 2D blocks providing at least n threads along
 * both x and y. Threads that fall outside the n x n range do nothing, so n
 * does not have to be a multiple of the block dimensions. No shared memory
 * is used.
 */
__global__ void matrixMul(float *a, float *b, float *c, int n)
{
    // threadIdx.x selects the column so that threads with consecutive x
    // touch consecutive elements of b and c instead of striding by n.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < n && col < n)
    {
        // Widen before multiplying so row * n cannot overflow int for large n.
        const size_t rowBase = (size_t)row * n;

        float sum = 0.0f;
        for (int k = 0; k < n; k++)
        {
            sum += a[rowBase + k] * b[(size_t)k * n + col];
        }
        c[rowBase + col] = sum;
    }
}

/* Evaluate a CUDA runtime call; on failure report where it happened and exit. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Compare the device result c against a host-side reference product of a and b
 * (all row-major n x n). Returns 0 when every element matches within
 * tolerance; otherwise prints the first mismatch and returns 1.
 */
static int verifyProduct(const float *a, const float *b, const float *c, int n)
{
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            // Accumulate the reference in double so it is the more accurate side.
            double ref = 0.0;
            for (int k = 0; k < n; k++)
            {
                ref += (double)a[i * n + k] * (double)b[k * n + j];
            }

            // Mixed absolute/relative tolerance: host and device round a long
            // float dot product differently, so a fixed 1e-5 bound alone is
            // too strict once the sums grow large.
            double got = (double)c[i * n + j];
            if (fabs(got - ref) > 1e-5 + 1e-4 * fabs(ref))
            {
                printf("Error: Result mismatch at (%d, %d), expected %f but got %f\n", i, j, ref, got);
                return 1;
            }
        }
    }
    return 0;
}

/*
 * Fill two N x N matrices with random values, multiply them on the GPU with
 * matrixMul, and check the result against a host computation.
 *
 * Returns 0 when the GPU result matches, 1 on a mismatch or host allocation
 * failure. A failing CUDA call terminates the process via CUDA_CHECK.
 */
int main()
{
    const size_t bytes = (size_t)N * N * sizeof(float);

    // Host matrices
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *c = (float *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Initialize host matrices with random values in [0, 1]
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i * N + j] = (float)rand() / RAND_MAX;
            b[i * N + j] = (float)rand() / RAND_MAX;
        }
    }

    // Device matrices
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));

    // Copy host matrices to device
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers all N x N elements even when N is not a
    // multiple of the block dimensions.
    dim3 block_size(16, 16);
    dim3 grid_size((N + block_size.x - 1) / block_size.x, (N + block_size.y - 1) / block_size.y);

    matrixMul<<<grid_size, block_size>>>(d_a, d_b, d_c, N);
    // A launch does not return a status; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any fault it raised.
    CUDA_CHECK(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Verify the result
    int status = verifyProduct(a, b, c, N);
    if (status == 0)
    {
        printf("Matrix multiplication successful!\n");
    }

    // Free memory on both the success and the mismatch path
    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return status;
}

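/*
 * CUDA matrix multiplication parallelization code example.
 *
 * Original source: https://www.cveoy.top/t/topic/nwg4
 * Copyright belongs to the author. Do not repost or scrape.
 *
 * (Footer of the source page: "Free AI, click here; no registration or login required.")
 */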