5TensorRT部署
5.1 目标检测常见的落地形式
1、TensorRT是什么
TensorRT是推理优化器,能对训练好的模型进行优化。可以理解为只有前向传播的深度学习框架,这个框架可以将Caffe,TensorFlow的网络模型解析,然后与TensorRT中对应的层进行一一映射,把其他框架的模型统一全部转换到TensorRT中,然后在TensorRT中可以针对NVIDIA自家GPU实施优化策略,并进行部署加速。当你的网络训练完之后,可以将训练模型文件直接丢进TensorRT中,而不再需要依赖深度学习框架(Caffe,TensorFlow等)。
2、本文AI部署流程
先把onnx转化为TensorRT的Engine文件,然后让c++环境下的TensorRT直接加载Engine文件,从而构建engine,本文主要讲解onnx转换至Engine,然后进行基于TensorRT的C++推理检测。
转换和部署模型5个基本步骤:
- step1:获取模型
- step2:选择batchsize
- step3:选择精度
- step4:模型转换
- step5:模型部署
5.2 ONNX-TensorRT的部署流程
1、ONNX转化为TRT Engine
# 导出onnx文件 python export.py ---weights weights/v5lite-g.pt --batch-size 1 --imgsz 640 --include onnx --simplify # 使用TensorRT官方的trtexec工具将onnx文件转换为engine trtexec --explicitBatch --onnx=./v5lite-g.onnx --saveEngine=v5lite-g.trt --fp16
闲话不多说,这里已经拿到了trt的engine,那么如何进行推理呢?总的来说,分为3步:
- 首先load你的engine,拿到一个ICudaEngine, 这个是TensorRT推理的核心;
- 定位模型的输入和输出,有几个输入有几个输出;
- forward模型,然后拿到输出,对输出进行后处理。
当然这里最核心的东西其实就两个,一个是如何导入拿到CudaEngine,第二个是比较麻烦的后处理。
2、加载TRT Engine
bool Model::readTrtFile() { std::string cached_engine; std::fstream file; std::cout << "loading filename from:" << engine_file << std::endl; nvinfer1::IRuntime *trtRuntime; file.open(engine_file, std::ios::binary | std::ios::in); if (!file.is_open()) { std::cout << "read file error: " << engine_file << std::endl; cached_engine = ""; } while (file.peek() != EOF) { std::stringstream buffer; buffer << file.rdbuf(); cached_engine.append(buffer.str()); } file.close(); trtRuntime = nvinfer1::createInferRuntime(gLogger.getTRTLogger()); engine = trtRuntime->deserializeCudaEngine(cached_engine.data(), cached_engine.size(), nullptr); std::cout << "deserialize done" << std::endl; } // 加载 TensorRT Engine void v5Lite::LoadEngine() { // create and load engine std::fstream existEngine; existEngine.open(engine_file, std::ios::in); // 如果存在已经转换完成的TensorRT Engine文件,则直接加载 if (existEngine) { readTrtFile(engine_file, engine); assert(engine != nullptr); } // 如果不存在已经转换完成的TensorRT Engine文件,则直接加载ONNX权重进行在线生成 else { onnxToTRTModel(onnx_file, engine_file, engine, BATCH_SIZE); assert(engine != nullptr); } }
3、后处理之坐标转换
YOLOv5的坐标转换方法
std::vector<std::vector<V5lite::DetectRes>> V5lite::postProcess(const std::vector<cv::Mat> &vec_Mat, float *output, const int &outSize) { std::vector<std::vector<DetectRes>> vec_result; int index = 0; for (const cv::Mat &src_img : vec_Mat) { std::vector<DetectRes> result; float ratio = float(src_img.cols) / float(IMAGE_WIDTH) > float(src_img.rows) / float(IMAGE_HEIGHT) ? float(src_img.cols) / float(IMAGE_WIDTH) : float(src_img.rows) / float(IMAGE_HEIGHT); float *out = output + index * outSize; int position = 0; for (int n = 0; n < (int)grids.size(); n++) { for (int c = 0; c < grids[n][0]; c++) { std::vector<int> anchor = anchors[n * grids[n][0] + c]; for (int h = 0; h < grids[n][1]; h++) for (int w = 0; w < grids[n][2]; w++) { float *row = out + position * (CATEGORY + 5); position++; DetectRes box; auto max_pos = std::max_element(row + 5, row + CATEGORY + 5); box.prob = row[4] * row[max_pos - row]; if (box.prob < obj_threshold) continue; box.classes = max_pos - row - 5; // 坐标的反参数化,和前文的坐标转换对接 box.x = (row[0] * 2 - 0.5 + w) / grids[n][2] * IMAGE_WIDTH * ratio; box.y = (row[1] * 2 - 0.5 + h) / grids[n][1] * IMAGE_HEIGHT * ratio; box.w = pow(row[2] * 2, 2) * anchor[0] * ratio; box.h = pow(row[3] * 2, 2) * anchor[1] * ratio; result.push_back(box); } } } NmsDetect(result); vec_result.push_back(result); index++; } return vec_result; }
4、进行模型推理
// 推理整个文件夹的文件 bool YOLOv5::InferenceFolder(const std::string &folder_name) { // 读取文件夹下面的文件,并返回为一个 string vector 迭代器 std::vector<std::string> sample_images = readFolder(folder_name); //get context assert(engine != nullptr); // 创建上下文,创建一些空间来存储中间值。一个engine可以创建多个context,分别执行多个推理任务。 context = engine->createExecutionContext(); assert(context != nullptr); // 传递给Engine的输入输出buffers指针,这里对应一个输入和一个输出 assert(engine->getNbBindings() == 2); void *buffers[2]; std::vector<int64_t> bufferSize; int nbBindings = engine->getNbBindings(); bufferSize.resize(nbBindings); for (int i = 0; i < nbBindings; ++i) { // 获取输入或输出的维度信息 nvinfer1::Dims dims = engine->getBindingDimensions(i); // 获取输入或输出的数据类型信息 nvinfer1::DataType dtype = engine->getBindingDataType(i); int64_t totalSize = volume(dims) * 1 * getElementSize(dtype); bufferSize[i] = totalSize; std::cout << "binding" << i << ": " << totalSize << std::endl; // &buffers是双重指针 相当于改变指针本身,这里就是把输入或输出进行向量化操作 cudaMalloc(&buffers[i], totalSize); } //get stream cudaStream_t stream; // 创建 Stream cudaStreamCreate(&stream); int outSize = bufferSize[1] / sizeof(float) / BATCH_SIZE; // 执行推理 EngineInference(sample_images, outSize, buffers, bufferSize, stream); // 释放 stream 和 buffers cudaStreamDestroy(stream); cudaFree(buffers[0]); cudaFree(buffers[1]); // destroy the engine context->destroy(); engine->destroy(); } void YOLOv5::EngineInference(const std::vector<std::string> &image_list, const int &outSize, void **buffers, const std::vector<int64_t> &bufferSize, cudaStream_t stream) { int index = 0; int batch_id = 0; std::vector<cv::Mat> vec_Mat(BATCH_SIZE); std::vector<std::string> vec_name(BATCH_SIZE); float total_time = 0; // 遍历图像路径list for (const std::string &image_name : image_list) { index++; std::cout << "Processing: " << image_name << std::endl; // 读取图像内容到cv_mat cv::Mat src_img = cv::imread(image_name); // 把图像和图像名分别保存在vec_Mat和vec_name之中 if (src_img.data) { vec_Mat[batch_id] = src_img.clone(); vec_name[batch_id] = image_name; batch_id++; } if (batch_id == BATCH_SIZE or index == image_list.size()) { // 声明时间戳 t_start_pre auto t_start_pre = std::chrono::high_resolution_clock::now(); std::cout << "prepareImage" << std::endl; std::vector<float>curInput = prepareImage(vec_Mat); auto t_end_pre = std::chrono::high_resolution_clock::now(); // 至此,prepare Image的时间已经计算完成 float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count(); std::cout << "prepare image take: " << total_pre << " ms." << std::endl; total_time += total_pre; batch_id = 0; if (!curInput.data()) { std::cout << "prepare images ERROR!" << std::endl; continue; } // 将数据从CPU端传送到GPU端 std::cout << "host2device" << std::endl; cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream); // 执行推理 std::cout << "execute" << std::endl; auto t_start = std::chrono::high_resolution_clock::now(); context->execute(BATCH_SIZE, buffers); auto t_end = std::chrono::high_resolution_clock::now(); float total_inf = std::chrono::duration<float, std::milli>(t_end - t_start).count(); std::cout << "Inference take: " << total_inf << " ms." << std::endl; total_time += total_inf; std::cout << "execute success" << std::endl; std::cout << "device2host" << std::endl; std::cout << "post process" << std::endl; auto r_start = std::chrono::high_resolution_clock::now(); auto *out = new float[outSize * BATCH_SIZE]; // Copy GPU端的推理结果到CPU端 cudaMemcpyAsync(out, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream); // 阻塞当前程序的执行,直到所有任务都处理完毕,这样可以将计算和主机与设备之前的传输并行化,提高效率。 cudaStreamSynchronize(stream); // 进行后处理操作 auto boxes = postProcess(vec_Mat, out, outSize); auto r_end = std::chrono::high_resolution_clock::now(); float total_res = std::chrono::duration<float, std::milli>(r_end - r_start).count(); std::cout << "Post process take: " << total_res << " ms." << std::endl; total_time += total_res; for (int i = 0; i < (int)vec_Mat.size(); i++) { auto org_img = vec_Mat[i]; if (!org_img.data) continue; auto rects = boxes[i]; for(const auto &rect : rects) { char t[256]; sprintf(t, "%.2f", rect.prob); std::string name = coco_labels[rect.classes] + "-" + t; // 图书添加文字 cv::putText(org_img, name, cv::Point(rect.x - rect.w / 2, rect.y - rect.h / 2 - 5), cv::FONT_HERSHEY_COMPLEX, 0.7, class_colors[rect.classes], 2); // 绘制矩形框 cv::Rect rst(rect.x - rect.w / 2, rect.y - rect.h / 2, rect.w, rect.h); cv::rectangle(org_img, rst, class_colors[rect.classes], 2, cv::LINE_8, 0); } int pos = vec_name[i].find_last_of("."); std::string rst_name = vec_name[i].insert(pos, "_"); std::cout << rst_name << std::endl; // 保存检测结果 cv::imwrite(rst_name, org_img); } vec_Mat = std::vector<cv::Mat>(BATCH_SIZE); delete[] out; } } std::cout << "Average processing time is " << total_time / image_list.size() << "ms" << std::endl; }
5、后处理之NMS C++实现
void V5lite::NmsDetect(std::vector<DetectRes> &detections) { sort(detections.begin(), detections.end(), [=](const DetectRes &left, const DetectRes &right) { return left.prob > right.prob; }); for (int i = 0; i < (int)detections.size(); i++) for (int j = i + 1; j < (int)detections.size(); j++) { if (detections[i].classes == detections[j].classes) { // 计算DIoU的值 float iou = IOUCalculate(detections[i], detections[j]); if (iou > nms_threshold) detections[j].prob = 0; } } detections.erase(std::remove_if(detections.begin(), detections.end(), [](const DetectRes &det) { return det.prob == 0; }), detections.end()); } // 计算 DIOU float v5Lite::IOUCalculate(const YOLOv5::DetectRes &det_a, const YOLOv5::DetectRes &det_b) { cv::Point2f center_a(det_a.x, det_a.y); cv::Point2f center_b(det_b.x, det_b.y); // 计算左上角角点坐标 cv::Point2f left_up(std::min(det_a.x - det_a.w / 2, det_b.x - det_b.w / 2), std::min(det_a.y - det_a.h / 2, det_b.y - det_b.h / 2)); // 计算右下角角点坐标 cv::Point2f right_down(std::max(det_a.x + det_a.w / 2, det_b.x + det_b.w / 2), std::max(det_a.y + det_a.h / 2, det_b.y + det_b.h / 2)); // 计算框的中心点距离 float distance_d = (center_a - center_b).x * (center_a - center_b).x + (center_a - center_b).y * (center_a - center_b).y; // 计算框的角点距离 float distance_c = (left_up - right_down).x * (left_up - right_down).x + (left_up - right_down).y * (left_up - right_down).y; float inter_l = det_a.x - det_a.w / 2 > det_b.x - det_b.w / 2 ? det_a.x - det_a.w / 2 : det_b.x - det_b.w / 2; float inter_t = det_a.y - det_a.h / 2 > det_b.y - det_b.h / 2 ? det_a.y - det_a.h / 2 : det_b.y - det_b.h / 2; float inter_r = det_a.x + det_a.w / 2 < det_b.x + det_b.w / 2 ? det_a.x + det_a.w / 2 : det_b.x + det_b.w / 2; float inter_b = det_a.y + det_a.h / 2 < det_b.y + det_b.h / 2 ? det_a.y + det_a.h / 2 : det_b.y + det_b.h / 2; if (inter_b < inter_t || inter_r < inter_l) return 0; // 计算交集 float inter_area = (inter_b - inter_t) * (inter_r - inter_l); // 计算并集 float union_area = det_a.w * det_a.h + det_b.w * det_b.h - inter_area; if (union_area == 0) return 0; else return inter_area / union_area - distance_d / distance_c; }
CMakeLists.txt如下:
cmake_minimum_required(VERSION 3.5) project(v5lite_trt) set(CMAKE_CXX_STANDARD 14) # CUDA find_package(CUDA REQUIRED) message(STATUS "Find CUDA include at ${CUDA_INCLUDE_DIRS}") message(STATUS "Find CUDA libraries: ${CUDA_LIBRARIES}") # TensorRT set(TENSORRT_ROOT "/home/chaucer/TensorRT-8.0.1.6") find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT} PATH_SUFFIXES include/) message(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}") find_library(TENSORRT_LIBRARY_INFER nvinfer HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 lib/x64) find_library(TENSORRT_LIBRARY_ONNXPARSER nvonnxparser HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 lib/x64) set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_ONNXPARSER}) message(STATUS "Find TensorRT libs: ${TENSORRT_LIBRARY}") # OpenCV find_package(OpenCV REQUIRED) message(STATUS "Find OpenCV include at ${OpenCV_INCLUDE_DIRS}") message(STATUS "Find OpenCV libraries: ${OpenCV_LIBRARIES}") set(COMMON_INCLUDE ./includes/common) set(YAML_INCLUDE ./includes/yaml-cpp/include) set(YAML_LIB_DIR ./includes/yaml-cpp/libs) include_directories(${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${OpenCV_INCLUDE_DIRS} ${COMMON_INCLUDE} ${YAML_INCLUDE}) link_directories(${YAML_LIB_DIR}) add_executable(v5lite_trt main.cpp v5lite.cpp) target_link_libraries(v5lite_trt ${OpenCV_LIBRARIES} ${CUDA_LIBRARIES} ${TENSORRT_LIBRARY} yaml-cpp)
mkdir build cd build cmake .. make -j8 v5lite_trt ../config.yaml ../samples/
5、检测结果和时间
6参考
[1].https://github.com/ppogg/YOLOv5-Lite
[2].https://zhuanlan.zhihu.com/p/400545131
[3].https://github.com/ultralytics/yolov5
[4].https://zhuanlan.zhihu.com/p/172121380
[5].https://zhuanlan.zhihu.com/p/143747206