TensorRT

  1. Frameworks

    • Training frameworks: TensorFlow/PyTorch; the trained model is exported to pb/onnx
    • Inference/deployment framework: TensorRT accelerates model inference by converting pb/onnx into its own engine format, which then runs efficiently on the target hardware
    • Typical deployment pipeline: PyTorch → ONNX → ONNX Runtime/TensorRT
  2. Deployment pain points

    • Dynamic shapes (see the optimization-profile sketch after this list)
    • Custom operators
    • Compatibility between the computation graph (onnx/pb) and the inference engine
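    • A minimal sketch, under the assumption of a single input named "images" with a dynamic batch dimension, of how TensorRT handles dynamic shapes through an optimization profile attached to the builder config (the builder/config objects are created as in section 6):

      #include "NvInfer.h"

      // attach a dynamic-batch optimization profile to an existing builder config;
      // the input name and the min/opt/max shapes are assumptions and must match the ONNX model
      void addDynamicBatchProfile(nvinfer1::IBuilder &builder,
                                  nvinfer1::IBuilderConfig &config) {
        nvinfer1::IOptimizationProfile *profile = builder.createOptimizationProfile();
        profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN,
                               nvinfer1::Dims4{1, 3, 640, 640});
        profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT,
                               nvinfer1::Dims4{8, 3, 640, 640});
        profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX,
                               nvinfer1::Dims4{16, 3, 640, 640});
        config.addOptimizationProfile(profile);
      }
      // at run time the concrete shape is set on the execution context before enqueue,
      // e.g. context->setBindingDimensions(0, nvinfer1::Dims4{4, 3, 640, 640});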
  3. ONNX Runtime

    • Consumes ONNX models directly
    • Cross-platform (Windows, Linux, macOS, web browser, Android, iOS, etc.)
    • Supports both GPU and CPU execution
    • My current understanding: the session lives on the host (CPU) and keeps launching ops onto the accelerator (GPU/NPU); a minimal session sketch follows
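    • A minimal sketch of the ONNX Runtime C++ API, assuming a 1x3x640x640 float input and I/O tensor names "images"/"output" (these depend on the exported model):

      #include <onnxruntime_cxx_api.h>

      #include <iostream>
      #include <vector>

      // create the environment and session (the session itself lives on the host)
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "demo");
      Ort::SessionOptions opts;
      // optionally enable the CUDA execution provider; otherwise ops run on CPU
      // OrtCUDAProviderOptions cuda_opts{};
      // opts.AppendExecutionProvider_CUDA(cuda_opts);
      Ort::Session session(env, "yolov5m.onnx", opts);

      // wrap host memory as an input tensor (shape and names are assumptions)
      std::vector<int64_t> shape{1, 3, 640, 640};
      std::vector<float> data(1 * 3 * 640 * 640, 0.0f);
      Ort::MemoryInfo mem_info =
          Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
      Ort::Value input = Ort::Value::CreateTensor<float>(
          mem_info, data.data(), data.size(), shape.data(), shape.size());

      // run: ONNX Runtime dispatches each op to the configured execution provider
      const char *input_names[] = {"images"};
      const char *output_names[] = {"output"};
      std::vector<Ort::Value> outputs = session.Run(
          Ort::RunOptions{nullptr}, input_names, &input, 1, output_names, 1);
      std::cout << "got " << outputs.size() << " output tensor(s)" << std::endl;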
  4. TensorRT

    • pipeline
      • Parse the ONNX model with OnnxParser
      • Create the builder, config and optimization profile
      • Serialize the network into a plan
      • Deserialize the plan to create the engine
      • Create the execution context (similar to a session)
      • Run: copy memory between host and device and enqueue inference
    • plugin
      • Plugins are inserted into the network as .so libraries, so the builder cannot fuse them with other layers (see the loading sketch below)
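      • A minimal sketch of making plugins visible to TensorRT: built-in plugins are registered through initLibNvInferPlugins, and a custom plugin .so usually self-registers its creators (via REGISTER_TENSORRT_PLUGIN) when the library is loaded; the library name below is hypothetical:

        #include <dlfcn.h>
        #include <iostream>
        #include "NvInferPlugin.h"

        // register TensorRT's built-in plugins with the global plugin registry
        // (logger is an nvinfer1::ILogger, e.g. the MyLogger defined in section 6)
        initLibNvInferPlugins(&logger, "");

        // load a custom plugin library; its plugin creators are expected to
        // self-register in static initializers when the .so is opened
        void *handle = dlopen("libmy_yolo_plugin.so", RTLD_NOW);  // hypothetical name
        if (handle == nullptr) {
          std::cerr << dlerror() << std::endl;
        }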
  5. Triton: https://mp.weixin.qq.com/s/jWZuNKpVM4k5aDe2JmB-Tg

    • Serving framework open-sourced by NVIDIA in 2018; it schedules the inference engines produced by TensorRT and handles incoming inference requests
    • It starts a server on a local port; a Triton client (the note mentions pytritonclient) sends the data to be predicted to that port and receives the inference results
  6. YOLO practice: https://ost.51cto.com/posts/18986

    • Image preprocessing: resize by the long side, pad, normalize

      #include <algorithm>
      #include <opencv2/opencv.hpp>

      cv::Mat input_image = cv::imread("dog.jpg");
      cv::Mat resize_image;
      const int model_width = 640;
      const int model_height = 640;
      // scale factor of the long side so the image fits into 640x640
      const float ratio = std::min(model_width / (input_image.cols * 1.0f),
                                   model_height / (input_image.rows * 1.0f));

      const int border_width = input_image.cols * ratio;
      const int border_height = input_image.rows * ratio;
      // padding offsets that center the resized image
      const int x_offset = (model_width - border_width) / 2;
      const int y_offset = (model_height - border_height) / 2;
      cv::resize(input_image, resize_image, cv::Size(border_width, border_height));
      cv::copyMakeBorder(resize_image, resize_image, y_offset, y_offset, x_offset,
                         x_offset, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
      // OpenCV loads BGR; the model expects RGB
      cv::cvtColor(resize_image, resize_image, cv::COLOR_BGR2RGB);

      // HWC uint8 -> CHW float, normalized to [0, 1]
      float *input_blob = new float[model_height * model_width * 3];
      const int channels = resize_image.channels();
      const int width = resize_image.cols;
      const int height = resize_image.rows;
      for (int c = 0; c < channels; c++) {
        for (int h = 0; h < height; h++) {
          for (int w = 0; w < width; w++) {
            input_blob[c * width * height + h * width + w] =
                resize_image.at<cv::Vec3b>(h, w)[c] / 255.0f;
          }
        }
      }
    • Model serialization

      #include <cassert>
      #include <fstream>
      #include <iostream>
      #include <string>

      #include "NvInfer.h"
      #include "NvOnnxParser.h"

      // logger
      class MyLogger : public nvinfer1::ILogger {
       public:
        explicit MyLogger(nvinfer1::ILogger::Severity severity =
                              nvinfer1::ILogger::Severity::kWARNING)
            : severity_(severity) {}

        void log(nvinfer1::ILogger::Severity severity,
                 const char *msg) noexcept override {
          if (severity <= severity_) {
            std::cerr << msg << std::endl;
          }
        }
        nvinfer1::ILogger::Severity severity_;
      };

      // builder
      MyLogger logger;
      nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(logger);

      // network (explicit-batch mode is required for ONNX models)
      const uint32_t explicit_batch = 1U << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
      nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicit_batch);

      // parse
      const std::string model_path = "yolov5m.onnx";
      nvonnxparser::IParser *parser = nvonnxparser::createParser(*network, logger);
      parser->parseFromFile(model_path.c_str(),
                            static_cast<int>(nvinfer1::ILogger::Severity::kERROR));
      // print parser errors, if any
      for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
        std::cout << parser->getError(i)->desc() << std::endl;
      }

      // build config: workspace memory / precision
      nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
      config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1U << 25);
      if (builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
      }

      // serialize: this is where TensorRT runs its build-time optimizations
      nvinfer1::IHostMemory *serialized_model =
          builder->buildSerializedNetwork(*network, *config);

      // save the engine (binary file)
      const std::string engine_file_path = "yolov5m.engine";
      std::ofstream out_file(engine_file_path, std::ios::binary);
      assert(out_file.is_open());
      out_file.write(static_cast<const char *>(serialized_model->data()),
                     serialized_model->size());
      out_file.close();

      // once the engine file is built, the builder/network/config/parser are no longer needed
      delete config;
      delete parser;
      delete network;
      delete builder;
    • Model deserialization

      // Option 1: deserialize directly from the in-memory serialized model
      nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
      nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(
          serialized_model->data(), serialized_model->size());

      delete serialized_model;
      delete runtime;

      // Option 2: load the engine file from disk and deserialize it
      const std::string engine_file_path = "yolov5m.engine";
      std::ifstream ifs(engine_file_path, std::ios::binary);
      ifs.seekg(0, std::ios::end);
      const size_t model_size = ifs.tellg();
      ifs.seekg(0, std::ios::beg);
      void *model_mem = malloc(model_size);
      ifs.read(static_cast<char *>(model_mem), model_size);
      ifs.close();

      nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
      nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(model_mem, model_size);

      delete runtime;
      free(model_mem);
    • Model inference

      #include <cuda_runtime.h>

      // the execution context manages inference (similar to a session)
      nvinfer1::IExecutionContext *context = engine->createExecutionContext();

      // memory preparation
      void *buffers[2];
      // query the model input size and allocate GPU memory
      nvinfer1::Dims input_dim = engine->getBindingDimensions(0);
      int input_size = 1;
      for (int j = 0; j < input_dim.nbDims; ++j) {
        input_size *= input_dim.d[j];
      }
      cudaMalloc(&buffers[0], input_size * sizeof(float));
      // query the model output size and allocate GPU memory
      nvinfer1::Dims output_dim = engine->getBindingDimensions(1);
      int output_size = 1;
      for (int j = 0; j < output_dim.nbDims; ++j) {
        output_size *= output_dim.d[j];
      }
      cudaMalloc(&buffers[1], output_size * sizeof(float));
      // allocate matching CPU memory for the model output
      float *output_buffer = new float[output_size]();

      // CUDA stream: copy in, enqueue, copy out, then synchronize
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      cudaMemcpyAsync(buffers[0], input_blob, input_size * sizeof(float),  // host to device
                      cudaMemcpyHostToDevice, stream);
      context->enqueueV2(buffers, stream, nullptr);
      cudaMemcpyAsync(output_buffer, buffers[1], output_size * sizeof(float),  // device to host
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
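    • Resource cleanup: a minimal sketch of releasing what the snippets above allocated once inference is finished (in a long-running service the engine and context would normally be kept and reused across requests):

      // per-inference resources
      cudaStreamDestroy(stream);
      cudaFree(buffers[0]);
      cudaFree(buffers[1]);
      delete[] output_buffer;
      delete[] input_blob;

      // release the execution context and engine when shutting down
      delete context;
      delete engine;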