将cuBLAS库的乘法运算进行了封装,方便了算法调用;
将原文中"计算结果是转置矩阵"的实现改为了直接得到未转置的结果,这样可以直接使用计算结果;
测试并更改了乘法参数,解决了原文中更改矩阵大小时报错的问题;
将int换成了long,缓解了矩阵过大时的越界问题;
将单精度cublasSgemm换成了cublasDgemm,缓解了矩阵元素数值过大时精度的丢失问题;
总的来说,本博客的代码利用cuBLAS库实现了两个矩阵相乘,提高了矩阵乘法的计算速度。
test.cpp
#include "cuda_runtime.h"
#include "cublas_v2.h"

#include <time.h>

#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

namespace {

/// Owns one device allocation of doubles and releases it on scope exit, so
/// every early return in matMult_cuBLAS frees the GPU memory it acquired.
struct DeviceBuffer {
    double* ptr = nullptr;

    DeviceBuffer() = default;
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    // cudaFree on a null pointer performs no operation, so no guard is needed.
    ~DeviceBuffer() { cudaFree(ptr); }

    /// Allocates room for `count` doubles; returns false on failure.
    bool allocate(size_t count) {
        if (cudaMalloc(reinterpret_cast<void**>(&ptr), count * sizeof(double)) != cudaSuccess) {
            ptr = nullptr;
            return false;
        }
        return true;
    }
};

/// Allocates a rowSize x colSize matrix of longs with every element set to 0.
long** allocMat(long rowSize, long colSize) {
    long** mat = new long*[rowSize];
    for (long i = 0; i < rowSize; i++) {
        mat[i] = new long[colSize]();
    }
    return mat;
}

/// Releases a matrix created by allocMat, uniformMat or matMult_cuBLAS.
/// Passing nullptr is allowed and does nothing.
void freeMat(long** mat, long rowSize) {
    if (mat == nullptr) {
        return;
    }
    for (long i = 0; i < rowSize; i++) {
        delete[] mat[i];
    }
    delete[] mat;
}

}  // namespace

/// @brief Computes C = A * B with cuBLAS (double precision on the device).
///
/// @param A         rowSizeA x colSizeA input matrix (array of row pointers).
/// @param B         colSizeA x colSizeB input matrix (array of row pointers).
/// @param rowSizeA  Number of rows of A.
/// @param colSizeA  Number of columns of A, which is also the number of rows of B.
/// @param colSizeB  Number of columns of B.
/// @param cuHandle  An already created cuBLAS handle.
/// @return A newly allocated rowSizeA x colSizeB matrix owned by the caller
///         (one `new[]` per row plus one for the row-pointer array), or
///         nullptr if the arguments are invalid or a CUDA/cuBLAS call fails.
///
/// Elements are converted long -> double -> long, so values whose magnitude
/// exceeds what a double represents exactly are not reproduced exactly.
long** matMult_cuBLAS(long** A, long** B, long rowSizeA, long colSizeA, long colSizeB,
                      cublasHandle_t cuHandle) {
    // cublasDgemm takes its dimensions as int, so reject anything that would
    // be silently truncated by the long -> int conversion.
    if (A == nullptr || B == nullptr || rowSizeA < 0 || colSizeA < 0 || colSizeB < 0 ||
        rowSizeA > INT_MAX || colSizeA > INT_MAX || colSizeB > INT_MAX) {
        cerr << "matMult_cuBLAS: invalid matrix arguments" << endl;
        return nullptr;
    }

    // An empty product needs no GPU work: the result is empty, or all zeros
    // when only the inner dimension is 0.
    if (rowSizeA == 0 || colSizeA == 0 || colSizeB == 0) {
        return allocMat(rowSizeA, colSizeB);
    }

    const size_t rowsA = static_cast<size_t>(rowSizeA);
    const size_t inner = static_cast<size_t>(colSizeA);
    const size_t colsB = static_cast<size_t>(colSizeB);
    const size_t countA = rowsA * inner;
    const size_t countB = inner * colsB;
    const size_t countC = rowsA * colsB;

    // Flatten both inputs into contiguous row-major host buffers.
    vector<double> hostA(countA);
    vector<double> hostB(countB);
    vector<double> hostC(countC);
    for (size_t i = 0; i < rowsA; i++) {
        for (size_t j = 0; j < inner; j++) {
            hostA[i * inner + j] = static_cast<double>(A[i][j]);
        }
    }
    for (size_t i = 0; i < inner; i++) {
        for (size_t j = 0; j < colsB; j++) {
            hostB[i * colsB + j] = static_cast<double>(B[i][j]);
        }
    }

    DeviceBuffer devA;
    DeviceBuffer devB;
    DeviceBuffer devC;
    if (!devA.allocate(countA) || !devB.allocate(countB) || !devC.allocate(countC)) {
        cerr << "matMult_cuBLAS: cudaMalloc failed" << endl;
        return nullptr;
    }

    // cudaMemcpy takes a size_t byte count, so large matrices are not limited
    // by the int element count that cublasSetVector/cublasGetVector accept.
    if (cudaMemcpy(devA.ptr, hostA.data(), countA * sizeof(double), cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(devB.ptr, hostB.data(), countB * sizeof(double), cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "matMult_cuBLAS: host-to-device copy failed" << endl;
        return nullptr;
    }

    // cuBLAS is column-major. A row-major M x N buffer read as column-major
    // with leading dimension N is the transpose, so CUBLAS_OP_T on both
    // operands recovers A and B. The product is written column-major with
    // leading dimension rowSizeA.
    const int m = static_cast<int>(rowSizeA);
    const int n = static_cast<int>(colSizeB);
    const int k = static_cast<int>(colSizeA);
    const double alpha = 1.0;
    const double beta = 0.0;
    const cublasStatus_t gemmStatus =
        cublasDgemm(cuHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k,
                    &alpha, devA.ptr, k, devB.ptr, n, &beta, devC.ptr, m);
    if (gemmStatus != CUBLAS_STATUS_SUCCESS) {
        cerr << "matMult_cuBLAS: cublasDgemm failed with status " << static_cast<int>(gemmStatus) << endl;
        return nullptr;
    }

    if (cudaMemcpy(hostC.data(), devC.ptr, countC * sizeof(double), cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "matMult_cuBLAS: device-to-host copy failed" << endl;
        return nullptr;
    }

    // Convert the column-major device result back into row-pointer form.
    long** C = allocMat(rowSizeA, colSizeB);
    for (size_t i = 0; i < rowsA; i++) {
        for (size_t j = 0; j < colsB; j++) {
            C[i][j] = static_cast<long>(hostC[j * rowsA + i]);
        }
    }
    return C;
}

/// @brief Builds a rowSize x colSize matrix of pseudo-random integers.
///
/// @param rowSize   Number of rows.
/// @param colSize   Number of columns.
/// @param minValue  Smallest value an element may take (inclusive).
/// @param maxValue  Largest value an element may take (inclusive).
/// @return A newly allocated matrix owned by the caller.
///
/// If maxValue < minValue the two bounds are swapped, which avoids a modulo
/// by zero or by a negative span.
long** uniformMat(long rowSize, long colSize, long minValue, long maxValue) {
    // Seed from the system clock only once. Reseeding with time() on every
    // call gives two matrices created within the same second the identical
    // rand() sequence, so they differ only by their offset.
    static bool seeded = false;
    if (!seeded) {
        srand(static_cast<unsigned>(time(nullptr)));
        seeded = true;
    }

    if (maxValue < minValue) {
        const long tmp = minValue;
        minValue = maxValue;
        maxValue = tmp;
    }
    const long span = maxValue - minValue + 1;

    long** mat = new long*[rowSize];
    for (long i = 0; i < rowSize; i++) {
        mat[i] = new long[colSize];
        for (long j = 0; j < colSize; j++) {
            mat[i][j] = static_cast<long>(rand() % span) + minValue;
        }
    }
    return mat;
}

/// Demo driver: multiplies a random 3x4 matrix by a random 4x2 matrix on the
/// GPU and prints both operands and the product.
int main(void) {
    // Create the cuBLAS handle once here and pass it to matMult_cuBLAS.
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        // Report every creation failure, not only CUBLAS_STATUS_NOT_INITIALIZED.
        cout << "CUBLAS 对象实例化出错" << endl;
        getchar();
        return EXIT_FAILURE;
    }

    // Matrix dimensions.
    long rowSizeA = 3;  // rows of A
    long colSizeA = 4;  // columns of A, which is also the rows of B
    long colSizeB = 2;  // columns of B

    // A is 3x4 with elements drawn from [0, 4].
    long** A = uniformMat(rowSizeA, colSizeA, 0, 4);
    // B is 4x2 with elements drawn from [5, 9].
    long** B = uniformMat(colSizeA, colSizeB, 5, 9);

    // Print A and B.
    cout << "矩阵 A :" << endl;
    for (long i = 0; i < rowSizeA; i++) {
        for (long j = 0; j < colSizeA; j++) {
            cout << A[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    cout << "矩阵 B :" << endl;
    for (long i = 0; i < colSizeA; i++) {
        for (long j = 0; j < colSizeB; j++) {
            cout << B[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    // C = A * B on the GPU.
    long** C = matMult_cuBLAS(A, B, rowSizeA, colSizeA, colSizeB, cuHandle);
    if (C == nullptr) {
        freeMat(A, rowSizeA);
        freeMat(B, colSizeA);
        cublasDestroy(cuHandle);
        return EXIT_FAILURE;
    }

    // Print the product.
    cout << "矩阵 C :" << endl;
    for (long i = 0; i < rowSizeA; i++) {
        for (long j = 0; j < colSizeB; j++) {
            cout << C[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    // Release the host matrices and the cuBLAS handle.
    freeMat(A, rowSizeA);
    freeMat(B, colSizeA);
    freeMat(C, rowSizeA);
    cublasDestroy(cuHandle);
    return 0;
}
在终端输入:
nvcc -lcublas test.cpp -o t
./t

运算结果:

矩阵 A :
1 3 2 0
2 1 2 1
4 3 2 4

矩阵 B :
6 8
7 5
7 6
7 6

矩阵 C :
41 35
40 39
87 83