Frameworks
- Training frameworks: TensorFlow/PyTorch; the trained model is exported to pb/onnx
- Inference/deployment frameworks: TensorRT accelerates inference by converting pb/onnx into its own engine format that runs efficiently on the target hardware
- Deployment pipeline: PyTorch -> ONNX -> ONNX Runtime / TensorRT
Deployment pain points
- dynamic shape
- custom operators
- compatibility between the computation graph (onnx/pb) and the inference engine
ONNX Runtime
- Runs ONNX models directly
- Cross-platform: Windows, Linux, macOS, web browsers, Android, iOS, etc.
- Supports GPU & CPU
- My tentative understanding: the session is created on the host (CPU), which then keeps launching ops onto the accelerator (GPU/NPU); see the minimal sketch below
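
A minimal ONNX Runtime C++ sketch of this flow; the model path, input/output names, and the 1x3x640x640 shape are placeholder assumptions for illustration:

#include <onnxruntime_cxx_api.h>

#include <vector>

int main() {
  // The session lives on the host; ops are dispatched to the chosen execution provider.
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ort-demo");
  Ort::SessionOptions options;
  options.SetIntraOpNumThreads(1);
  // Optional GPU execution provider (requires a CUDA-enabled ONNX Runtime build):
  // OrtCUDAProviderOptions cuda_options{};
  // options.AppendExecutionProvider_CUDA(cuda_options);
  Ort::Session session(env, "yolov5m.onnx", options);  // hypothetical model path

  // Host-side input buffer wrapped as an Ort::Value (names/shape are assumptions).
  std::vector<int64_t> shape{1, 3, 640, 640};
  std::vector<float> input(1 * 3 * 640 * 640, 0.0f);
  Ort::MemoryInfo mem_info =
      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
      mem_info, input.data(), input.size(), shape.data(), shape.size());

  const char *input_names[] = {"images"};
  const char *output_names[] = {"output"};
  auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor,
                             1, output_names, 1);
  float *out = outputs.front().GetTensorMutableData<float>();
  (void)out;  // post-process detections here
  return 0;
}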
TensorRT
- pipeline
- parse the onnx model with OnnxParser
- create the builder, config, and optimization profile (for dynamic shapes, see the sketch after this list)
- serialize network into a plan
- deserialize the plan to create an engine
- create a context (similar to a session)
- run: memory transfers between host and device
- plugin
- a plugin is inserted into the network as a compiled .so, so it cannot be fused with other layers
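
For the dynamic-shape pain point listed earlier, TensorRT needs an optimization profile attached to the builder config. A minimal sketch on top of the builder/config created in the serialization code below; the input name "images" and the min/opt/max shapes are assumptions:

// register a dynamic-shape range for one network input (hypothetical name and ranges)
nvinfer1::IOptimizationProfile *profile = builder->createOptimizationProfile();
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN,
                       nvinfer1::Dims4{1, 3, 320, 320});
profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT,
                       nvinfer1::Dims4{1, 3, 640, 640});
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX,
                       nvinfer1::Dims4{8, 3, 1280, 1280});
config->addOptimizationProfile(profile);

At run time, the execution context must then be given concrete input dimensions within this range before inference is enqueued.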
Triton: https://mp.weixin.qq.com/s/jWZuNKpVM4k5aDe2JmB-Tg
- Serving framework open-sourced by NVIDIA in 2018; it schedules the inference engines produced by TensorRT and handles inference requests
- It starts a server on a local port; the client (pytritonclient) sends the data to be predicted to that port to run inference
YOLO practice: https://ost.51cto.com/posts/18986
Image preprocessing: resize along the long side, padding, normalization
#include <algorithm>
#include <opencv2/opencv.hpp>

cv::Mat input_image = cv::imread("dog.jpg");
cv::Mat resize_image;
const int model_width = 640;
const int model_height = 640;
// scale factor for the long side so the image fits into 640x640
const float ratio = std::min(model_width / (input_image.cols * 1.0f),
                             model_height / (input_image.rows * 1.0f));
const int border_width = input_image.cols * ratio;
const int border_height = input_image.rows * ratio;
const int x_offset = (model_width - border_width) / 2;
const int y_offset = (model_height - border_height) / 2;
cv::resize(input_image, resize_image, cv::Size(border_width, border_height));
// pad the short side with the constant (114, 114, 114) used by YOLOv5
cv::copyMakeBorder(resize_image, resize_image, y_offset, y_offset, x_offset,
                   x_offset, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
cv::cvtColor(resize_image, resize_image, cv::COLOR_BGR2RGB);
// HWC uint8 -> CHW float in [0, 1]
float *input_blob = new float[model_height * model_width * 3];  // CHW
const int channels = resize_image.channels();
const int width = resize_image.cols;
const int height = resize_image.rows;
for (int c = 0; c < channels; c++) {
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      input_blob[c * width * height + h * width + w] =
          resize_image.at<cv::Vec3b>(h, w)[c] / 255.0f;
    }
  }
}

Model serialization
// headers needed by this and the following TensorRT/CUDA snippets
#include <cassert>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include <cuda_runtime.h>

#include <NvInfer.h>
#include <NvOnnxParser.h>

// logger
class MyLogger : public nvinfer1::ILogger {
public:
explicit MyLogger(nvinfer1::ILogger::Severity severity =
nvinfer1::ILogger::Severity::kWARNING)
: severity_(severity) {}
void log(nvinfer1::ILogger::Severity severity,
const char *msg) noexcept override {
if (severity <= severity_) {
std::cerr << msg << std::endl;
}
}
nvinfer1::ILogger::Severity severity_;
};
// builder
MyLogger logger;
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(logger);
// network
const uint32_t explicit_batch = 1U << static_cast<uint32_t>(
nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicit_batch);
// parse
const std::string model_path = "yolov5m.onnx";
nvonnxparser::IParser *parser = nvonnxparser::createParser(*network, logger);
parser->parseFromFile(model_path.c_str(),
                      static_cast<int>(nvinfer1::ILogger::Severity::kERROR));
// print parser errors, if any
for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
std::cout << parser->getError(i)->desc() << std::endl;
}
// build config: mem/precision
nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1U << 25);
if (builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16);
}
// serialize: TensorRT runs its build-time optimizations here
nvinfer1::IHostMemory *serialized_model =
builder->buildSerializedNetwork(*network, *config);
// save engine (open the file in binary mode so the plan is not corrupted)
const std::string engine_file_path = "yolov5m.engine";
std::ofstream out_file(engine_file_path, std::ios::binary);
assert(out_file.is_open());
out_file.write(static_cast<const char *>(serialized_model->data()),
               serialized_model->size());
out_file.close();
// once the engine file has been built, the builder/network/config/parser are no longer needed
delete config;
delete parser;
delete network;
delete builder;

Model deserialization
// Option 1: deserialize the engine directly from the in-memory serialized model
nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(
serialized_model->data(), serialized_model->size());
delete serialized_model;
delete runtime;
// Option 2: read the engine file from disk and deserialize it instead
const std::string engine_file_path = "yolov5m.engine";
std::stringstream engine_file_stream;
engine_file_stream.seekg(0, engine_file_stream.beg);
std::ifstream ifs(engine_file_path, std::ios::binary);
engine_file_stream << ifs.rdbuf();
ifs.close();
engine_file_stream.seekg(0, std::ios::end);
const int model_size = engine_file_stream.tellg();
engine_file_stream.seekg(0, std::ios::beg);
void *model_mem = malloc(model_size);
engine_file_stream.read(static_cast<char *>(model_mem), model_size);
nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(model_mem, model_size);
delete runtime;
free(model_mem);

Model inference
// the execution context manages inference (similar to a session)
nvinfer1::IExecutionContext *context = engine->createExecutionContext();
// mem prep
void *buffers[2];
// query the model input size and allocate GPU memory for it
nvinfer1::Dims input_dim = engine->getBindingDimensions(0);
int input_size = 1;
for (int j = 0; j < input_dim.nbDims; ++j) {
input_size *= input_dim.d[j];
}
cudaMalloc(&buffers[0], input_size * sizeof(float));
// query the model output size and allocate GPU memory for it
nvinfer1::Dims output_dim = engine->getBindingDimensions(1);
int output_size = 1;
for (int j = 0; j < output_dim.nbDims; ++j) {
output_size *= output_dim.d[j];
}
cudaMalloc(&buffers[1], output_size * sizeof(float));
// allocate matching CPU memory for the model output
float *output_buffer = new float[output_size]();
// cuda stream, enqueue
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(buffers[0], input_blob, input_size * sizeof(float),  // host to device
                cudaMemcpyHostToDevice, stream);
context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(output_buffer, buffers[1], output_size * sizeof(float),  // device to host
                cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
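
After the stream synchronizes, the detections are in output_buffer and can be post-processed. A minimal cleanup sketch for the resources created above, assuming nothing else still uses them:

// release the stream, device/host buffers, and TensorRT objects (context before engine)
cudaStreamDestroy(stream);
cudaFree(buffers[0]);
cudaFree(buffers[1]);
delete[] output_buffer;
delete[] input_blob;
delete context;
delete engine;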