3、【KV260开发】Vitis AI library APIs

Vitis AI 提供了 C++/Python 两种接口,两种接口的函数名称类似,下文主要以 C++ 接口为例进行讲解。


Vitis AI Library提供了以下四种API:

  • Vitis AI Library API_0 based on VART

  • Vitis AI Library API_1 based on AI Library

  • Vitis AI Library API_2 based on DpuTask

  • Vitis AI Library API_3 based on Graph_runner



如果使用 VART(Vitis AI Runtime)进行代码的编写,流程如下:



/**
 * @brief Run DPU Task for ResNet50
 * @param runner - pointer to the VART runner created for the ResNet50 DPU subgraph
 * @return none
 */
// Classifies every image found under baseImagePath with the given runner and
// prints the top-5 labels for each one.
//
// runner: VART runner for the ResNet50 DPU subgraph; must outlive this call.
//
// Reads the globals baseImagePath, wordsPath and shapes (the latter must have
// been filled by getTensorShape() before this function is called).
// Returns early, after printing an error, when no images or no labels exist.
void runResnet50(vart::Runner* runner) {
  vector<string> kinds, images;

  /* Load all image names. */
  ListImages(baseImagePath, images);
  if (images.empty()) {
    cerr << "\nError: No images existing under " << baseImagePath << endl;
    return;
  }

  /* Load all kinds words. */
  LoadWords(wordsPath + "words.txt", kinds);
  if (kinds.empty()) {
    cerr << "\nError: No words exist in file words.txt." << endl;
    return;
  }

  /* Mean value for ResNet50 specified in Caffe prototxt */
  const float mean[3] = {104, 107, 123};

  /* get in/out tensors and dims */
  auto outputTensors = runner->get_output_tensors();
  auto inputTensors = runner->get_input_tensors();
  auto out_dims = outputTensors[0]->get_shape();
  auto in_dims = inputTensors[0]->get_shape();

  auto input_scale = get_input_scale(inputTensors[0]);
  auto output_scale = get_output_scale(outputTensors[0]);

  /* get shape info */
  const int outSize = shapes.outTensorList[0].size;
  const int inSize = shapes.inTensorList[0].size;
  const int inHeight = shapes.inTensorList[0].height;
  const int inWidth = shapes.inTensorList[0].width;

  const int batchSize = in_dims[0];

  std::vector<std::unique_ptr<vart::TensorBuffer>> inputs, outputs;
  vector<Mat> imageList;

  // Host-side staging buffers; std::vector releases them on every exit path.
  std::vector<int8_t> imageInputs(static_cast<size_t>(inSize) * batchSize);
  std::vector<float> softmax(outSize);
  std::vector<int8_t> FCResult(static_cast<size_t>(batchSize) * outSize);

  std::vector<vart::TensorBuffer*> inputsPtr, outputsPtr;
  std::vector<std::shared_ptr<xir::Tensor>> batchTensors;

  /* run with batch */
  for (unsigned int n = 0; n < images.size(); n += batchSize) {
    // The last batch may be only partially filled.
    const unsigned int runSize =
        (images.size() < (n + batchSize)) ? (images.size() - n) : batchSize;
    in_dims[0] = runSize;
    out_dims[0] = batchSize;

    for (unsigned int i = 0; i < runSize; i++) {
      Mat image = imread(baseImagePath + images[n + i]);

      /* image pre-process */
      Mat image2;
      // cv::Size takes (width, height); the loop below indexes image2 as
      // (row < inHeight, col < inWidth).
      resize(image, image2, Size(inWidth, inHeight), 0, 0);
      for (int h = 0; h < inHeight; h++) {
        for (int w = 0; w < inWidth; w++) {
          for (int c = 0; c < 3; c++) {
            imageInputs[i * inSize + h * inWidth * 3 + w * 3 + c] =
                static_cast<int8_t>((image2.at<Vec3b>(h, w)[c] - mean[c]) *
                                    input_scale);
          }
        }
      }
      imageList.push_back(image);
    }

    /* in/out tensor refactory for batch input/output */
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(inputTensors[0]->get_name(), in_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    inputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        imageInputs.data(), batchTensors.back().get()));
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(outputTensors[0]->get_name(), out_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    outputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        FCResult.data(), batchTensors.back().get()));

    /* tensor buffer input/output */
    inputsPtr.clear();
    outputsPtr.clear();
    inputsPtr.push_back(inputs[0].get());
    outputsPtr.push_back(outputs[0].get());

    /* run and block until the job has finished */
    auto job_id = runner->execute_async(inputsPtr, outputsPtr);
    runner->wait(job_id.first, -1);

    for (unsigned int i = 0; i < runSize; i++) {
      cout << "\nImage : " << images[n + i] << endl;
      /* Calculate softmax on CPU and display TOP-5 classification results */
      CPUCalcSoftmax(&FCResult[i * outSize], outSize, softmax.data(),
                     output_scale);
      TopK(softmax.data(), outSize, 5, kinds);
      /* Display the image unless QUIET_RUN is set in the environment */
      const bool quiet = (getenv("QUIET_RUN") != nullptr);
      if (!quiet) {
        cv::imshow("Classification of ResNet50", imageList[i]);
        cv::waitKey(10000);
      }
    }

    // Drop per-batch state; the tensor buffers point into batchTensors, so
    // release them first.
    imageList.clear();
    inputs.clear();
    outputs.clear();
    batchTensors.clear();
  }
}

/**
 * @brief Entry for running ResNet50 neural network
 * @note Runner APIs prefixed with "dpu" are used to easily program &
 *       deploy ResNet50 on DPU platform.
 */
int main(int argc, char* argv[]) {
  // Check args
  if (argc != 2) {
    cout << "Usage of resnet50 demo: ./resnet50 [model_file]" << endl;
    return -1;
  auto graph = xir::Graph::deserialize(argv[1]);
  auto subgraph = get_dpu_subgraph(graph.get());
  CHECK_EQ(subgraph.size(), 1u)
      << "resnet50 should have one and only one dpu subgraph.";
  LOG(INFO) << "create running for subgraph: " << subgraph[0]->get_name();
  /*create runner*/
  auto runner = vart::Runner::create_runner(subgraph[0], "run");
  // ai::XdpuRunner* runner = new ai::XdpuRunner("./");
  /*get in/out tensor*/
  auto inputTensors = runner->get_input_tensors();
  auto outputTensors = runner->get_output_tensors();

  /*get in/out tensor shape*/
  int inputCnt = inputTensors.size();
  int outputCnt = outputTensors.size();
  TensorShape inshapes[inputCnt];
  TensorShape outshapes[outputCnt];
  shapes.inTensorList = inshapes;
  shapes.outTensorList = outshapes;
  getTensorShape(runner.get(), &shapes, inputCnt, outputCnt);

  /*run with batch*/
  return 0;

AI Library

当使用的模型在 Vitis AI Model Zoo 中时,可以直接复用相应的模型 demo,以 yolov3 为例:

// Runs YOLOv3 detection on a single image with the AI Library API.
//
// argv[1]: model name passed to vitis::ai::YOLOv3::create.
// argv[2]: path of the image to process.
// Prints one "RESULT:" line per detected box, draws the boxes on the image
// and writes it to result.jpg. Returns 1 on a usage or load error.
int main(int argc, char *argv[]) {
  // Both argv[1] and argv[2] are read below, so two arguments are required.
  if (argc < 3) {
    cerr << "usage: " << argv[0] << " model_name image_file_url " << endl;
    return 1;
  }
  Mat img = cv::imread(argv[2]);
  if (img.empty()) {
    cerr << "cannot load " << argv[2] << endl;
    return 1;
  }

  auto yolo = vitis::ai::YOLOv3::create(argv[1], true);
  auto results = yolo->run(img);

  for (auto &box : results.bboxes) {
    int label = box.label;
    // box.x/y/width/height are scaled by the image size here, i.e. they are
    // used as fractions of the image dimensions.
    float xmin = box.x * img.cols + 1;
    float ymin = box.y * img.rows + 1;
    float xmax = xmin + box.width * img.cols;
    float ymax = ymin + box.height * img.rows;
    // Clip the box to the image.
    if (xmin < 0.) xmin = 1.;
    if (ymin < 0.) ymin = 1.;
    if (xmax > img.cols) xmax = img.cols;
    if (ymax > img.rows) ymax = img.rows;
    float confidence = box.score;

    cout << "RESULT: " << label << "\t" << xmin << "\t" << ymin << "\t" << xmax
         << "\t" << ymax << "\t" << confidence << "\n";
    rectangle(img, Point(xmin, ymin), Point(xmax, ymax), Scalar(0, 255, 0), 1,
              1, 0);
  }
  // For an interactive preview, call imshow("", img) and waitKey(0) here.
  imwrite("result.jpg", img);

  return 0;
}


DPU Task

如果是使用DPU Task,那么可以直接参考yolov3这个例子:

//origin: Vitis-AI/demo/Vitis-AI-Library/samples/

// Post-processing parameters for yolov3_voc in protobuf text format.
// The string is parsed into a vitis::ai::proto::DpuModelParam at run time;
// adjust the values to match the model actually deployed.
const std::string yolov3_config =
    // Model identity.
    "   name: \"yolov3_voc_416\" \n"
    "   model_type : YOLOv3 \n"
    "   yolo_v3_param { \n"
    // Class count, anchors per layer and detection thresholds.
    "     num_classes: 20 \n"
    "     anchorCnt: 3 \n"
    "     conf_threshold: 0.3 \n"
    "     nms_threshold: 0.45 \n"
    // Names of the output layers.
    "     layer_name: \"81\" \n"
    "     layer_name: \"93\" \n"
    "     layer_name: \"105\" \n"
    // Anchor values, listed as consecutive pairs.
    "     biases: 10 \n"
    "     biases: 13 \n"
    "     biases: 16 \n"
    "     biases: 30 \n"
    "     biases: 33 \n"
    "     biases: 23 \n"
    "     biases: 30 \n"
    "     biases: 61 \n"
    "     biases: 62 \n"
    "     biases: 45 \n"
    "     biases: 59 \n"
    "     biases: 119 \n"
    "     biases: 116 \n"
    "     biases: 90 \n"
    "     biases: 156 \n"
    "     biases: 198 \n"
    "     biases: 373 \n"
    "     biases: 326 \n"
    "     test_mAP: false \n"
    "   } \n";

// Runs YOLOv3 detection on one or more images through the DpuTask API.
//
// argv[1]:   location of the xmodel.
// argv[2..]: image files; unreadable files are reported and skipped.
// Images are fed to the DPU in groups of at most `batch`; for every image the
// detected boxes are printed and drawn, and "<image>_result.jpg" is written.
// Returns 1 when the arguments, the images or the config are unusable.
int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "usage: " << argv[0] << " <xmodel> <image> [<image> ...]"
              << std::endl;
    return 1;
  }
  // argv[1] is the location of the xmodel.
  auto kernel_name = argv[1];

  // Read images from the given paths.
  vector<Mat> imgs;
  vector<string> imgs_names;
  for (int i = 2; i < argc; i++) {
    auto img = cv::imread(argv[i]);
    if (img.empty()) {
      std::cout << "Cannot load " << argv[i] << std::endl;
      continue;
    }
    imgs.push_back(img);
    imgs_names.push_back(argv[i]);
  }
  if (imgs.empty()) {
    std::cerr << "No image load success!" << std::endl;
    return 1;
  }

  // Create a dpu task object.
  auto task = vitis::ai::DpuTask::create(kernel_name);
  auto batch = task->get_input_batch(0, 0);
  // Set the mean values and scale values.
  task->setMeanScaleBGR({0.0f, 0.0f, 0.0f},
                        {0.00390625f, 0.00390625f, 0.00390625f});
  auto input_tensor = task->getInputTensor(0u);
  CHECK_EQ((int)input_tensor.size(), 1)
      << " the dpu model must have only one input";
  auto width = input_tensor[0].width;
  auto height = input_tensor[0].height;
  auto size = cv::Size(width, height);

  // Create a config and set the correlating data to control post-process.
  vitis::ai::proto::DpuModelParam config;
  auto ok =
      google::protobuf::TextFormat::ParseFromString(yolov3_config, &config);
  if (!ok) {
    cerr << "Set parameters failed!" << endl;
    return 1;
  }

  vector<Mat> inputs;
  vector<int> input_cols, input_rows;
  for (size_t i = 0; i < imgs.size(); i++) {
    /* Pre-process part */
    // Remember the original size, then resize if it does not match the model.
    cv::Mat image;
    input_cols.push_back(imgs[i].cols);
    input_rows.push_back(imgs[i].rows);
    if (size != imgs[i].size()) {
      cv::resize(imgs[i], image, size);
    } else {
      image = imgs[i];
    }
    inputs.push_back(image);

    // Keep collecting until the batch is full or this is the last image.
    const bool batch_full = inputs.size() >= static_cast<size_t>(batch);
    const bool last_image = (i + 1 == imgs.size());
    if (!batch_full && !last_image) {
      continue;
    }
    // Index in imgs of the first image of the current batch.
    const size_t first = i + 1 - inputs.size();

    // Set the input images into dpu.
    task->setImageRGB(inputs);

    /* DPU Runtime */
    // Run the dpu.
    task->run(0u);

    /* Post-process part */
    // Get output.
    auto output_tensor = task->getOutputTensor(0u);
    // Execute the yolov3 post-processing.
    auto results = vitis::ai::yolov3_post_process(
        input_tensor, output_tensor, config, input_cols, input_rows);

    /* Print the results */
    // Convert coordinates and draw boxes on the original image.
    for (int k = 0; k < static_cast<int>(inputs.size()); k++) {
      cout << "batch_index " << k << " "  //
           << "image_name " << imgs_names[first + k] << endl;
      for (auto& box : results[k].bboxes) {
        int label = box.label;
        float xmin = box.x * input_cols[k] + 1;
        float ymin = box.y * input_rows[k] + 1;
        float xmax = xmin + box.width * input_cols[k];
        float ymax = ymin + box.height * input_rows[k];
        if (xmin < 0.) xmin = 1.;
        if (ymin < 0.) ymin = 1.;
        if (xmax > input_cols[k]) xmax = input_cols[k];
        if (ymax > input_rows[k]) ymax = input_rows[k];
        float confidence = box.score;

        cout << "RESULT: " << label << "\t" << xmin << "\t" << ymin << "\t"
             << xmax << "\t" << ymax << "\t" << confidence << "\n";
        rectangle(imgs[first + k], Point(xmin, ymin), Point(xmax, ymax),
                  Scalar(0, 255, 0), 1, 1, 0);
      }
      imwrite(imgs_names[first + k] + "_result.jpg", imgs[first + k]);
    }

    // Start a fresh batch.
    inputs.clear();
    input_cols.clear();
    input_rows.clear();
  }
  return 0;
}

Graph Runner


如果真出现了多个 subgraph,不妨先升级到 Vitis AI 1.4,因为 1.3 版本有 bug。如果 1.4 版本依旧是多个 subgraph,首先检查算子兼容性(参见《4、算子兼容性》),通过 Vitis-AI/tools/Vitis-AI-Library/cpu_task/ops 可以查询支持的算子。




// Lookup table for the first plate character: class index -> label.
// Index 0 is the "unknown" class. The remaining entries look like pinyin for
// Chinese province abbreviations (some spellings repeat) — TODO confirm.
const std::vector<std::string> charactor_0 = {
    "unknown",
    "jing", "hu",    "jin",   "yu",    "ji",    "jin",  "meng", "liao",
    "ji",   "hei",   "su",    "zhe",   "wan",   "min",  "gan",  "lu",
    "yu",   "e",     "xiang", "yue",   "gui",   "qiong", "chuan", "gui",
    "yun",  "zang",  "shan",  "gan",   "qing",  "ning", "xin",
};

// Lookup table for the second plate character: class index -> label.
// Index 0 is the "unknown" class, followed by capital letters; 'I' does not
// appear in the list.
const std::vector<std::string> charactor_1 = {
    "unknown",
    "A", "B", "C", "D", "E", "F", "G", "H",
    "J", "K", "L", "M", "N", "O", "P", "Q",
    "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
};

// Lookup table for the third to seventh plate characters: class index ->
// label. Index 0 is the "unknown" class, followed by the digits and then
// capital letters; 'I' and 'O' do not appear in the list.
const std::vector<std::string> charactor_2 = {
    "unknown",
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
    "A", "B", "C", "D", "E", "F", "G", "H",
    "J", "K", "L", "M", "N",
    "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
};

// Lookup table for the plate colour: class index -> colour name.
const std::vector<std::string> color = {
    "Blue",
    "Yellow",
};

// Forward declarations of the file-local helpers defined after main().

// Returns the integer "fix_point" attribute of `tensor`.
static int get_fix_point(const xir::Tensor* tensor);
// Returns a vector with one entry per dimension of `tensor`, all set to 0.
static std::vector<std::int32_t> get_index_zeros(const xir::Tensor* tensor);
// Reads `batch` images with cv::imread, cycling through `files` when there
// are fewer files than `batch`.
static std::vector<cv::Mat> read_images(const std::vector<std::string>& files,
                                        size_t batch);
// Writes `image` to `data1` as signed 8-bit values: each channel has 128
// subtracted, is multiplied by `scale` and is clamped to [-128, 127].
static void set_input_image(const cv::Mat& image, void* data1, float scale);
// Reads `data1` as `size` float scores and returns the K highest as
// (index, score) pairs, best first.
static std::vector<std::pair<int, float>> topk(void* data1, size_t size,
                                               int K);
// Returns the position in `outputs` of the buffer whose tensor is named
// `tensor_name`; the CHECK fails when no such buffer exists.
static size_t find_tensor_index(const char* tensor_name,
                             const std::vector<vart::TensorBuffer*>& outputs);
//platenum preprocess
static void preprocess_platenum(const std::vector<std::string>& files, 
                    const std::vector<vart::TensorBuffer*>& input_tensor_buffers) {
  auto input_tensor = input_tensor_buffers[0]->get_tensor();
  auto batch = input_tensor->get_shape().at(0);
  auto height = input_tensor->get_shape().at(1);
  auto width = input_tensor->get_shape().at(2);

  int fixpos = get_fix_point(input_tensor);
  float input_fixed_scale = std::exp2f(1.0f * (float)fixpos);

  auto size = cv::Size(width, height);
  auto images = read_images(files, batch);
  CHECK_EQ(images.size(), batch) 
    << "images number be read into input buffer must be equal to batch";

  for (int index = 0; index < batch; ++index) {
    cv::Mat resize_image;
    if (size != images[index].size()) {
      cv::resize(images[index], resize_image, size, 0);
    } else {
    uint64_t data_in = 0u;
    size_t size_in = 0u;
    auto idx = get_index_zeros(input_tensor);
    idx[0] = (int)index;
    std::tie(data_in, size_in) = input_tensor_buffers[0]->data(idx);
    set_input_image(resize_image, (void*)data_in, input_fixed_scale);

// Plate-number post-processing: takes the top-1 class of each of the eight
// output tensors and prints the plate colour and number per batch element.
//
// output_tensor_buffers: runner outputs; exactly eight buffers whose tensors
//                        are named "prob1" .. "prob8" and share one batch
//                        size. CHECKs fail otherwise.
// "prob1" selects from charactor_0, "prob2" from charactor_1, "prob3" ..
// "prob7" from charactor_2 and "prob8" from color.
static void postprocess_platenum(const std::vector<vart::TensorBuffer*>& output_tensor_buffers) {
  // Validate the count before touching element 0.
  const auto size = output_tensor_buffers.size();
  CHECK_EQ(size, 8u) << "output_tensor_buffers.size() must be 8";
  const auto batch = output_tensor_buffers[0]->get_tensor()->get_shape().at(0);
  for (auto i = 1u; i < size; ++i) {
    CHECK_EQ(output_tensor_buffers[i]->get_tensor()->get_shape().at(0), batch)
      << "all output_tensor_buffer batch number must be equal";
  }

  // ret[batch_index * size + tb_index] is the top-1 result of buffer tb_index.
  std::vector<std::pair<int, float>> ret;
  ret.reserve(static_cast<size_t>(batch) * size);
  for (int batch_index = 0; batch_index < batch; ++batch_index) {
    for (auto tb_index = 0u; tb_index < size; ++tb_index) {
      uint64_t data_out = 0u;
      size_t size_out = 0u;
      auto tensor = output_tensor_buffers[tb_index]->get_tensor();
      auto idx = get_index_zeros(tensor);
      idx[0] = batch_index;
      std::tie(data_out, size_out) = output_tensor_buffers[tb_index]->data(idx);
      auto elem_num = tensor->get_element_num() / batch;
      auto tb_top1 = topk(reinterpret_cast<void*>(data_out), elem_num, 1)[0];
      ret.push_back(tb_top1);
    }
  }

  // output_tensor_buffers may be out of order, so each result is located by
  // tensor name. The positions do not depend on the batch element, so they
  // are resolved once.
  static const char* const kProbNames[8] = {"prob1", "prob2", "prob3", "prob4",
                                            "prob5", "prob6", "prob7", "prob8"};
  size_t slot[8];
  for (size_t s = 0; s < 8; ++s) {
    slot[s] = find_tensor_index(kProbNames[s], output_tensor_buffers);
  }

  for (int batch_index = 0; batch_index < batch; ++batch_index) {
    const size_t base = static_cast<size_t>(batch_index) * size;
    // at() turns a class index outside the lookup table into an exception
    // instead of an out-of-bounds read.
    std::string plate_number = charactor_0.at(ret[base + slot[0]].first);
    plate_number += charactor_1.at(ret[base + slot[1]].first);
    for (size_t s = 2; s < 7; ++s) {
      plate_number += charactor_2.at(ret[base + slot[s]].first);
    }
    const std::string plate_color = color.at(ret[base + slot[7]].first);
    std::cout << "batch_index: " << batch_index << std::endl;
    std::cout << "plate_color: " << plate_color << std::endl;
    std::cout << "plate_number: " << plate_number << std::endl;
  }
}

int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr << "usage :" << argv[0] << " <model_name>"
              << " <image_url> [<image_url> ...]" << std::endl;
  std::string g_xmodel_file = std::string(argv[1]);
  std::vector<std::string> g_image_files;
  for (auto i = 2; i < argc; i++) {

  //create graph runner
  auto graph = xir::Graph::deserialize(g_xmodel_file);
  auto attrs = xir::Attrs::create();
  auto runner =
      vitis::ai::GraphRunner::create_graph_runner(graph.get(), attrs.get());
  CHECK(runner != nullptr);

  //get input/output tensor buffers
  auto input_tensor_buffers = runner->get_inputs();
  auto output_tensor_buffers = runner->get_outputs();

  //preprocess and fill input
  preprocess_platenum(g_image_files, input_tensor_buffers);

  //sync input tensor buffers
  for (auto& input : input_tensor_buffers) {
      input->sync_for_write(0, input->get_tensor()->get_data_size() /

  //run graph runner
  auto v = runner->execute_async(input_tensor_buffers, output_tensor_buffers);
  auto status = runner->wait((int)v.first, -1);
  CHECK_EQ(status, 0) << "failed to run the graph";

  //sync output tensor buffers
  for (auto output : output_tensor_buffers) {
      output->sync_for_read(0, output->get_tensor()->get_data_size() /

  //postprocess and print platenum result

  return 0;

// Returns the integer "fix_point" attribute of `tensor`.
// The CHECK fails, naming the tensor, when the attribute is absent.
static int get_fix_point(const xir::Tensor* tensor) {
  CHECK(tensor->has_attr("fix_point"))
        << "get tensor fix_point error! has no fix_point attr, tensor name is "
        << tensor->get_name();
  return tensor->template get_attr<int>("fix_point");
}

// Returns an index vector with one entry per dimension of `tensor`, all set
// to 0. Callers overwrite element 0 to address one batch element.
static std::vector<std::int32_t> get_index_zeros(const xir::Tensor* tensor) {
  auto ret = tensor->get_shape();
  std::fill(ret.begin(), ret.end(), 0);
  return ret;
}

// Loads exactly `batch` images with cv::imread.
//
// files: image paths; when there are fewer paths than `batch` they are reused
//        cyclically. Must not be empty.
// The CHECKs fail when `files` is empty or an image cannot be read.
static std::vector<cv::Mat> read_images(const std::vector<std::string>& files,
                                            size_t batch) {
  // Guards the modulo below against division by zero.
  CHECK(!files.empty()) << "no image file given";
  std::vector<cv::Mat> images(batch);
  for (auto index = 0u; index < batch; ++index) {
    const auto& file = files[index % files.size()];
    images[index] = cv::imread(file);
    CHECK(!images[index].empty()) << "cannot read image from " << file;
  }
  return images;
}

// Quantizes `image` into the signed 8-bit buffer `data1`.
//
// image: 8-bit, 3-channel image (it is read as cv::Vec3b).
// data1: destination holding at least rows * cols * 3 bytes, filled row by
//        row with the three channels of each pixel adjacent.
// scale: factor applied after subtracting the mean of 128 from each channel.
// Each value is clamped to [-128, 127] before it is stored.
static void set_input_image(const cv::Mat& image, void* data1, float scale) {
  CHECK_EQ(image.type(), CV_8UC3) << "set_input_image expects an 8-bit, 3-channel image";
  const float mean[3] = {128.0f, 128.0f, 128.0f};
  auto* data = static_cast<signed char*>(data1);
  for (int h = 0; h < image.rows; h++) {
    for (int w = 0; w < image.cols; w++) {
      const auto& pixel = image.at<cv::Vec3b>(h, w);
      for (int c = 0; c < 3; c++) {
        float value = (pixel[c] - mean[c]) * scale;
        value = std::max(std::min(value, 127.0f), -128.0f);
        // Truncates toward zero, as the original int conversion did.
        data[h * image.cols * 3 + w * 3 + c] = static_cast<signed char>(value);
      }
    }
  }
}

// Returns the K highest scores as (index, score) pairs, best first.
//
// data1: buffer that is read as `size` contiguous floats.
// size:  number of scores in data1.
// K:     number of results wanted; clamped to [0, size], so fewer than K
//        pairs come back when size < K.
static std::vector<std::pair<int, float>> topk(void* data1, size_t size,
                                                   int K) {
  const float* score = static_cast<const float*>(data1);
  // Without the clamp, begin() + K would run past end() for K > size.
  const size_t k = std::min(size, static_cast<size_t>(std::max(K, 0)));

  std::vector<int> indices(size);
  std::iota(indices.begin(), indices.end(), 0);
  // Only the first k positions need to be ordered.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [score](int a, int b) { return score[a] > score[b]; });

  std::vector<std::pair<int, float>> ret(k);
  std::transform(
      indices.begin(), indices.begin() + k, ret.begin(),
      [score](int index) { return std::make_pair(index, score[index]); });
  return ret;
}

static size_t find_tensor_index(const char* tensor_name,
                         const std::vector<vart::TensorBuffer*>& outputs) {
  auto it = std::find_if(outputs.begin(), outputs.end(),
                         [&tensor_name](const vart::TensorBuffer* tb) {
                         return tb->get_tensor()->get_name() == tensor_name;
  CHECK(it != outputs.end()) << "cannot find tensorbuffer. tensor_name=" << tensor_name;
  return it - outputs.begin();




# Model configuration for yolov3_voc in protobuf text format.
# NOTE(review): the enclosing "model { }" message and the placement of is_tf
# outside yolo_v3_param follow the layout of the model zoo prototxt files;
# confirm against the .prototxt shipped with the model.
model {
  name: "yolov3_voc"
  kernel {
    name: "yolov3_voc"
    mean: 0.0
    mean: 0.0
    mean: 0.0
    scale: 0.00390625
    scale: 0.00390625
    scale: 0.00390625
  }
  model_type : YOLOv3
  yolo_v3_param {
    num_classes: 20
    anchorCnt: 3
    layer_name: "59"
    layer_name: "67"
    layer_name: "75"
    conf_threshold: 0.3
    nms_threshold: 0.45
    biases: 10
    biases: 13
    biases: 16
    biases: 30
    biases: 33
    biases: 23
    biases: 30
    biases: 61
    biases: 62
    biases: 45
    biases: 59
    biases: 119
    biases: 116
    biases: 90
    biases: 156
    biases: 198
    biases: 373
    biases: 326
    test_mAP: false
  }
  is_tf: false
}







• Classification

• Face detection

• Face landmark detection

• SSD detection

• Pose detection

• Semantic segmentation

• Road line detection

• YOLOv3 detection

• YOLOv2 detection

• Openpose detection

• RefineDet detection

• ReID detection

• Multi-task

• Face recognition

• Plate detection

• Plate recognition

• Medical segmentation

• Medical detection

• Face quality

• Hourglass

• Retinaface

• Centerpoint

• Multitaskv3

• Pointpillars_nuscenes

• Rcan





