大家好,我是极智视界。本文主要聊一下 GPU 并行推理的几个方式。
CUDA 流表示一个 GPU 操作队列,该队列中的操作将按照添加到流中的先后顺序依次执行。可以将一个流看作 GPU 上的一个任务,不同的任务(流)之间可以并行执行。使用 CUDA 流,首先要选择一个支持设备重叠(Device Overlap)功能的设备:支持设备重叠功能的 GPU 能够在执行一个 CUDA 核函数的同时,在主机和设备之间执行数据复制操作。
支持重叠功能的设备的这一特性很重要,可以在一定程度上提升 GPU 程序的执行效率。一般情况下,CPU 内存远大于 GPU 内存,对于数据量比较大的情况,不可能把 CPU 缓冲区中的数据一次性传输给 GPU,需要分块传输。如果能够在分块传输的同时让 GPU 执行核函数运算,就形成了数据传输与计算的异步重叠,能够极大提升运算性能。
下面实际介绍几种 GPU 并行的方式。
1 CUDA 核函数并行
// Normalizes a batch of interleaved 3-channel float images and writes the
// result in planar order:
//   blob[n][c][y][x] = (imageDatas[n][y][x][c] - mean[c]) / std[c]
//
// Expected launch layout (set up by privateBlobFromImagesGpu):
//   block = (3, width)            threadIdx.x = channel c, threadIdx.y = column x
//   grid  = (height, batchCount)  blockIdx.x  = row y,     blockIdx.y  = image index
// Each thread handles exactly one element and the image dimensions are taken
// from blockDim/gridDim, so the launch dimensions must match the buffers.
// No shared memory is used.
//
// Defined before its launcher so that the <<<...>>> call below sees the
// declaration.
__global__ static void meanAndStdAndSplit(float* blob, const float* imageDatas, const float* mean, const float* std) {
    const int c = threadIdx.x;    // channel
    const int x = threadIdx.y;    // column
    const int y = blockIdx.x;     // row
    const int idx = blockIdx.y;   // image index inside the batch

    // blockDim.x = channels, blockDim.y = width, gridDim.x = height.
    // Destination is channel-major (planar), source is pixel-major (interleaved).
    const unsigned int blobIdx = idx * blockDim.x * blockDim.y * gridDim.x
                               + c * blockDim.y * gridDim.x
                               + y * blockDim.y
                               + x;
    const unsigned int imagesIdx = idx * blockDim.x * blockDim.y * gridDim.x
                                 + y * blockDim.x * blockDim.y
                                 + x * blockDim.x
                                 + c;

    blob[blobIdx] = (imageDatas[imagesIdx] - mean[c]) / std[c];
}

// Enqueues meanAndStdAndSplit on `stream` for `batchCount` images of
// `width` x `height` pixels with 3 channels. The call is asynchronous: the
// caller must synchronize `stream` before reading `blob`.
//
// All pointers are passed straight to the kernel and must therefore be
// device-accessible. `mean` and `std` are indexed by channel (3 values each).
//
// Preconditions that are NOT enforced here:
//   * 3 * width must not exceed the device's max threads per block (1024 on
//     current GPUs), i.e. width <= 341. Larger widths make the launch fail.
//   * batchCount must not exceed the grid's y-dimension limit (65535).
// A failed launch is only visible through cudaGetLastError(); this function
// returns void, so callers that need to detect it should query that right
// after the call.
void privateBlobFromImagesGpu(const float* imageDatas, int batchCount, int width, int height,
                              const float* mean, const float* std, float* blob, cudaStream_t stream) {
    // A zero-sized grid or block is an invalid launch configuration, and there
    // is nothing to compute for empty input anyway.
    if (batchCount <= 0 || width <= 0 || height <= 0) {
        return;
    }

    const dim3 block(3, width);
    const dim3 grid(height, batchCount);
    meanAndStdAndSplit<<<grid, block, 0, stream>>>(blob, imageDatas, mean, std);
}

// ---- Caller-side excerpt -------------------------------------------------
// Enqueue the preprocessing kernel once on each of the thNB streams.
// NOTE(review): every launch is given the same input and output buffers, so
// all streams compute the same result into the same memory. For real
// parallel work each stream presumably needs its own slice - confirm.
for (int i = 0; i < thNB; i++) {
    privateBlobFromImagesGpu((float*)m_converArray, imgdata.size.size(), m_inputW, m_inputH,
                             m_mean_GPU, m_std_GPU, (float*)m_Bindings.at(m_InputIndex), cudaStream[i]);
}

// Wait for each stream individually; the handle has to be indexed, passing
// the whole container is not a valid cudaStream_t.
for (int i = 0; i < thNB; i++) {
    cudaStreamSynchronize(cudaStream[i]);
}
2 调用英伟达 API 库并行
// Caller-side excerpt: resize every image of the batch to the network input
// size and convert it from 8-bit to float scaled by 1/255, using OpenCV's
// CUDA module.
//
// Both operations are enqueued on `stream` so that the final
// stream.waitForCompletion() actually waits for this work. Without the stream
// argument OpenCV runs them on the default stream and the explicit wait has
// nothing to wait for.
// NOTE(review): assumes `stream` is a cv::cuda::Stream, as suggested by the
// waitForCompletion() call - confirm at the declaration.
for (int i = 0; i < imgdata.size.size(); i++) {
    // Wrap the i-th source image; the GpuMat does not own the memory.
    // NOTE(review): the offset is w * h bytes per image, but a CV_8UC3 image
    // occupies w * h * 3 bytes - this looks like a missing channel factor.
    // It also assumes all earlier images have the same size. Verify against
    // how imgdata.data is packed.
    cv::cuda::GpuMat gpuRgbSrcImg(cv::Size(imgdata.size[i].w, imgdata.size[i].h), CV_8UC3,
                                  (cv::uint8_t*)imgdata.data + i * imgdata.size[i].w * imgdata.size[i].h);

    // Wrap the i-th slot of the resize buffer (8-bit, 3 channels).
    cv::cuda::GpuMat gpuRgbDstImg(cv::Size(m_inputW, m_inputH), CV_8UC3,
                                  (cv::uint8_t*)m_resizeArray + i * m_inputC * m_inputH * m_inputW);
    cv::cuda::resize(gpuRgbSrcImg, gpuRgbDstImg, cv::Size(m_inputW, m_inputH), 0.0, 0.0, cv::INTER_LINEAR, stream);

    // Wrap the i-th slot of the float buffer and convert into it.
    // NOTE(review): outputSize presumably equals (m_inputW, m_inputH). If it
    // does not, convertTo re-allocates dst_conver and m_converArray is never
    // written - confirm.
    cv::cuda::GpuMat dst_conver(outputSize, CV_32FC3, (float*)m_converArray + i * m_inputC * m_inputH * m_inputW);
    gpuRgbDstImg.convertTo(dst_conver, CV_32F, 1.0 / 255, 0, stream);
}

// Block the host until everything enqueued on `stream` above has finished.
stream.waitForCompletion();
3 TRT 并行
int testStream() { int outNB = 0; std::string model_path = "./data/"; int batchsize = 4; int streamNB = 2; DoInference *doInfer_stream1 = new DoInference(); std::vector<int> outputSize; bool isInit1 = doInfer_stream1->InitModle(model_path, OD, TensorRT, outputSize, streamNB, batchsize); std::vector<float*> inputData; std::vector<cudaStream_t> cudaStream; std::vector<vector<void*>>imgdata_stream; inputData.resize(streamNB); cudaStream.resize(streamNB); int* size = new int[4]; size[0] = 512 * 512 * 3; size[1] = 135168; size[2] = 33792; size[3] = 8848; imgdata_stream.resize(streamNB); for (int i = 0; i < streamNB; i++) { cudaStreamCreate(&cudaStream.at(i)); cudaStreamCreateWithFlags(&cudaStream.at(i), cudaStreamNonBlocking); cudaMallocHost(&inputData.at(i), batchsize * size[0] * sizeof(float)); imgdata_stream.at(i).resize(4); for (int j = 0; j < 4; j++) { cudaMalloc(&imgdata_stream.at(i).at(j), batchsize * size[j] * sizeof(float)); } for (int z = 0; z < size[0]; z++) { inputData.at(i)[z] = z; } } for (int count = 0; count < 5; count++) { for (int i = 0; i < streamNB; i++) { cudaMemcpyAsync(imgdata_stream.at(i).at(0), inputData.at(i), batchsize * size[0] * sizeof(float), cudaMemcpyHostToDevice, cudaStream.at(i)); doInfer_stream1->DoinferTestStream(imgdata_stream.at(i), batchsize, i, cudaStream.at(i)); } } for (int i = 0; i < streamNB; i++) { cudaStreamSynchronize(cudaStream.at(i)); } //cudaStreamSynchronize(cudaStream1); //cudaStreamSynchronize(cudaStream2); return 0; }
好了,以上聊了下 GPU 并行推理的几个方式,希望我的分享能对你的学习有一点帮助。