Frameworks
- Training frameworks: TensorFlow/PyTorch; the trained model is exported to pb/onnx
- Inference/deployment frameworks: TensorRT accelerates inference by converting pb/onnx into its own engine format that runs efficiently on the target hardware
- Deployment pipeline: PyTorch -> ONNX -> ONNX Runtime / TensorRT
Deployment pain points
- dynamic shape
- custom operators
- compatibility between the computation graph (onnx/pb) and the inference engine
ONNX Runtime
- Runs ONNX models directly
- Cross-platform: Windows, Linux, macOS, web browsers, Android, iOS, etc.
- Supports GPU & CPU
- My tentative understanding: the session is created on the host (CPU), which then keeps launching ops onto the accelerator (GPU/NPU); see the minimal sketch below
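
A minimal ONNX Runtime C++ sketch of this flow; the model path, input/output names, and the 1x3x640x640 shape are placeholder assumptions for illustration:

#include <onnxruntime_cxx_api.h>

#include <vector>

int main() {
  // The session lives on the host; ops are dispatched to the chosen execution provider.
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ort-demo");
  Ort::SessionOptions options;
  options.SetIntraOpNumThreads(1);
  // Optional GPU execution provider (requires a CUDA-enabled ONNX Runtime build):
  // OrtCUDAProviderOptions cuda_options{};
  // options.AppendExecutionProvider_CUDA(cuda_options);
  Ort::Session session(env, "yolov5m.onnx", options);  // hypothetical model path

  // Host-side input buffer wrapped as an Ort::Value (names/shape are assumptions).
  std::vector<int64_t> shape{1, 3, 640, 640};
  std::vector<float> input(1 * 3 * 640 * 640, 0.0f);
  Ort::MemoryInfo mem_info =
      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
      mem_info, input.data(), input.size(), shape.data(), shape.size());

  const char *input_names[] = {"images"};
  const char *output_names[] = {"output"};
  auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor,
                             1, output_names, 1);
  float *out = outputs.front().GetTensorMutableData<float>();
  (void)out;  // post-process detections here
  return 0;
}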
TensorRT
- pipeline
- parse the onnx model with OnnxParser
- create the builder, config, and optimization profile (for dynamic shapes, see the sketch after this list)
- serialize network into a plan
- deserialize the plan to create an engine
- create a context (similar to a session)
- run: memory transfers between host and device
- plugin
- a plugin is inserted into the network as a compiled .so, so it cannot be fused with other layers
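
For the dynamic-shape pain point listed earlier, TensorRT needs an optimization profile attached to the builder config. A minimal sketch on top of the builder/config created in the serialization code below; the input name "images" and the min/opt/max shapes are assumptions:

// register a dynamic-shape range for one network input (hypothetical name and ranges)
nvinfer1::IOptimizationProfile *profile = builder->createOptimizationProfile();
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN,
                       nvinfer1::Dims4{1, 3, 320, 320});
profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT,
                       nvinfer1::Dims4{1, 3, 640, 640});
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX,
                       nvinfer1::Dims4{8, 3, 1280, 1280});
config->addOptimizationProfile(profile);

At run time, the execution context must then be given concrete input dimensions within this range before inference is enqueued.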
Triton: https://mp.weixin.qq.com/s/jWZuNKpVM4k5aDe2JmB-Tg
- Serving framework open-sourced by NVIDIA in 2018; it schedules the inference engines produced by TensorRT and handles inference requests
- It starts a server on a local port; the client (pytritonclient) sends the data to be predicted to that port to run inference
YOLO practice: https://ost.51cto.com/posts/18986
Image preprocessing: resize along the long side, padding, normalization
#include <algorithm>
#include <opencv2/opencv.hpp>

cv::Mat input_image = cv::imread("dog.jpg");
cv::Mat resize_image;
const int model_width = 640;
const int model_height = 640;
// scale factor for the long side so the image fits into 640x640
const float ratio = std::min(model_width / (input_image.cols * 1.0f),
                             model_height / (input_image.rows * 1.0f));
const int border_width = input_image.cols * ratio;
const int border_height = input_image.rows * ratio;
const int x_offset = (model_width - border_width) / 2;
const int y_offset = (model_height - border_height) / 2;
cv::resize(input_image, resize_image, cv::Size(border_width, border_height));
// pad the short side with the constant (114, 114, 114) used by YOLOv5
cv::copyMakeBorder(resize_image, resize_image, y_offset, y_offset, x_offset,
                   x_offset, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
cv::cvtColor(resize_image, resize_image, cv::COLOR_BGR2RGB);
// HWC uint8 -> CHW float in [0, 1]
float *input_blob = new float[model_height * model_width * 3];  // CHW
const int channels = resize_image.channels();
const int width = resize_image.cols;
const int height = resize_image.rows;
for (int c = 0; c < channels; c++) {
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      input_blob[c * width * height + h * width + w] =
          resize_image.at<cv::Vec3b>(h, w)[c] / 255.0f;
    }
  }
}

Model serialization
// headers needed by this and the following TensorRT/CUDA snippets
#include <cassert>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include <cuda_runtime.h>

#include <NvInfer.h>
#include <NvOnnxParser.h>

// logger
class MyLogger : public nvinfer1::ILogger {
public:
explicit MyLogger(nvinfer1::ILogger::Severity severity =
nvinfer1::ILogger::Severity::kWARNING)
: severity_(severity) {}
void log(nvinfer1::ILogger::Severity severity,
const char *msg) noexcept override {
if (severity <= severity_) {
std::cerr << msg << std::endl;
}
}
nvinfer1::ILogger::Severity severity_;
};
// builder
MyLogger logger;
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(logger);
// network
const uint32_t explicit_batch = 1U << static_cast<uint32_t>(
nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicit_batch);
// parse
const std::string model_path = "yolov5m.onnx";
nvonnxparser::IParser *parser = nvonnxparser::createParser(*network, logger);
parser->parseFromFile(model_path.c_str(),
                      static_cast<int>(nvinfer1::ILogger::Severity::kERROR));
// print parser errors, if any
for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
std::cout << parser->getError(i)->desc() << std::endl;
}
// build config: mem/precision
nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1U << 25);
if (builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16);
}
// serialize: TensorRT runs its build-time optimizations here
nvinfer1::IHostMemory *serialized_model =
builder->buildSerializedNetwork(*network, *config);
// save engine (open the file in binary mode so the plan is not corrupted)
const std::string engine_file_path = "yolov5m.engine";
std::ofstream out_file(engine_file_path, std::ios::binary);
assert(out_file.is_open());
out_file.write(static_cast<const char *>(serialized_model->data()),
               serialized_model->size());
out_file.close();
// once the engine file has been built, the builder/network/config/parser are no longer needed
delete config;
delete parser;
delete network;
delete builder;

Model deserialization
// Option 1: deserialize the engine directly from the in-memory serialized model
nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(
serialized_model->data(), serialized_model->size());
delete serialized_model;
delete runtime;
// Option 2: read the engine file from disk and deserialize it instead
const std::string engine_file_path = "yolov5m.engine";
std::stringstream engine_file_stream;
engine_file_stream.seekg(0, engine_file_stream.beg);
std::ifstream ifs(engine_file_path, std::ios::binary);
engine_file_stream << ifs.rdbuf();
ifs.close();
engine_file_stream.seekg(0, std::ios::end);
const int model_size = engine_file_stream.tellg();
engine_file_stream.seekg(0, std::ios::beg);
void *model_mem = malloc(model_size);
engine_file_stream.read(static_cast<char *>(model_mem), model_size);
nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(model_mem, model_size);
delete runtime;
free(model_mem);

Model inference
// the execution context manages inference (similar to a session)
nvinfer1::IExecutionContext *context = engine->createExecutionContext();
// mem prep
void *buffers[2];
// query the model input size and allocate GPU memory for it
nvinfer1::Dims input_dim = engine->getBindingDimensions(0);
int input_size = 1;
for (int j = 0; j < input_dim.nbDims; ++j) {
input_size *= input_dim.d[j];
}
cudaMalloc(&buffers[0], input_size * sizeof(float));
// query the model output size and allocate GPU memory for it
nvinfer1::Dims output_dim = engine->getBindingDimensions(1);
int output_size = 1;
for (int j = 0; j < output_dim.nbDims; ++j) {
output_size *= output_dim.d[j];
}
cudaMalloc(&buffers[1], output_size * sizeof(float));
// allocate matching CPU memory for the model output
float *output_buffer = new float[output_size]();
// cuda stream, enqueue
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(buffers[0], input_blob, input_size * sizeof(float),  // host to device
                cudaMemcpyHostToDevice, stream);
context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(output_buffer, buffers[1], output_size * sizeof(float),  // device to host
                cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
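
After the stream synchronizes, the detections are in output_buffer and can be post-processed. A minimal cleanup sketch for the resources created above, assuming nothing else still uses them:

// release the stream, device/host buffers, and TensorRT objects (context before engine)
cudaStreamDestroy(stream);
cudaFree(buffers[0]);
cudaFree(buffers[1]);
delete[] output_buffer;
delete[] input_blob;
delete context;
delete engine;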