// 113 lines | 4.7 KiB | C++

// Modified from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_KERNEL_H
#define TRT_KERNEL_H
#include <cuda_runtime.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "cublas_v2.h"
#include "trt_plugin_helper.hpp"
// NOTE(review): a using-directive at header scope applies to every file that
// includes this header. The unqualified DataType used by the declarations
// below presumably resolves through it — confirm before narrowing or
// removing it.
using namespace nvinfer1;
// Debug switch, defined as 0. Nothing in the visible part of this header
// reads it.
#define DEBUG_ENABLE 0
// Four-coordinate box record (xmin, ymin, xmax, ymax) whose coordinates have
// type T. The members are declared in that order.
template <typename T>
struct Bbox {
  T xmin;
  T ymin;
  T xmax;
  T ymax;

  // Defaulted: the coordinates are left uninitialized unless the object is
  // value-initialized.
  Bbox() = default;

  // Stores the four arguments into xmin, ymin, xmax and ymax respectively.
  Bbox(T x0, T y0, T x1, T y1) : xmin(x0), ymin(y0), xmax(x1), ymax(y1) {}
};
// Per-box record: a score of type T plus an integer label, an integer box
// index and a boolean `kept` flag. The members are declared in that order.
template <typename T>
struct BboxInfo {
  T conf_score;
  int label;
  int bbox_idx;
  bool kept;

  // Defaulted: the fields are left uninitialized unless the object is
  // value-initialized.
  BboxInfo() = default;

  // Stores the four arguments into conf_score, label, bbox_idx and kept
  // respectively.
  BboxInfo(T score, int cls, int idx, bool keep)
      : conf_score(score), label(cls), bbox_idx(idx), kept(keep) {}
};
// Declared here, defined elsewhere. Takes a byte pointer `ptr` and an
// unsigned integer `to`, and returns a byte pointer.
// NOTE(review): presumably returns `ptr` rounded up to a multiple of `to` —
// confirm against the definition.
int8_t* alignPtr(int8_t* ptr, uintptr_t to);
// Declared here, defined elsewhere. Takes a byte pointer `ptr` and the size
// of the previous workspace, and returns a byte pointer.
// NOTE(review): presumably advances `ptr` by `previousWorkspaceSize` bytes to
// the start of the next sub-buffer, possibly with alignment — confirm.
int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize);
// Declared here, defined elsewhere. Returns void, so no status is reported
// back to the caller through the return value.
// NOTE(review): presumably writes evenly spaced segment offsets (multiples
// of `offset`, for `num_segments` segments) into `d_offsets` using `stream`;
// the d_ prefix suggests a device pointer — confirm against the definition.
void setUniformOffsets(cudaStream_t stream, int num_segments, int offset,
int* d_offsets);
// Declared here, defined elsewhere. Returns a pluginStatus_t. `flipXY` is
// optional and defaults to false.
// All data buffers are passed as untyped, non-const void*.
// NOTE(review): presumably runs non-maximum suppression for every class,
// interpreting the buffers according to DT_SCORE / DT_BBOX, reading the
// beforeNMS_* buffers and writing the afterNMS_* buffers — confirm against
// the definition.
pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes,
int num_preds_per_class, int top_k,
float nms_threshold, bool share_location,
bool isNormalized, DataType DT_SCORE,
DataType DT_BBOX, void* bbox_data,
void* beforeNMS_scores, void* beforeNMS_index_array,
void* afterNMS_scores, void* afterNMS_index_array,
bool flipXY = false);
// Declared here, defined elsewhere. Returns a size_t computed from N, C1 and
// the box data type.
// NOTE(review): presumably the byte size of the box-data workspace buffer —
// confirm the unit (bytes vs. elements) against the definition.
size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX);
// Declared here, defined elsewhere. Returns a size_t computed from
// shareLocation, N, C1 and the box data type.
// NOTE(review): presumably the byte size of the buffer holding permuted box
// data, and possibly zero when boxes are shared across classes — confirm.
size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1,
DataType DT_BBOX);
// Declared here, defined elsewhere. Returns a size_t computed from the batch
// count, class count, predictions per class and the score data type.
// NOTE(review): presumably the scratch size required by sortScoresPerClass —
// confirm, including the unit.
size_t sortScoresPerClassWorkspaceSize(int num, int num_classes,
int num_preds_per_class,
DataType DT_CONF);
// Declared here, defined elsewhere. Returns a size_t computed from the image
// count, items per image and the score data type.
// NOTE(review): presumably the scratch size required by sortScoresPerImage —
// confirm, including the unit.
size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image,
DataType DT_SCORE);
// Declared here, defined elsewhere. Returns a pluginStatus_t.
// Takes separate unsorted_* and sorted_* score/index buffers plus a
// caller-provided workspace, all as untyped void*.
// NOTE(review): presumably sorts each image's scores (carrying the box
// indices along) from the unsorted_* buffers into the sorted_* buffers;
// sort direction and workspace sizing are not visible here — confirm.
pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images,
int num_items_per_image, DataType DT_SCORE,
void* unsorted_scores,
void* unsorted_bbox_indices,
void* sorted_scores,
void* sorted_bbox_indices, void* workspace);
// Declared here, defined elsewhere. Returns a pluginStatus_t.
// Only one score buffer and one index buffer are passed (both non-const),
// together with a caller-provided workspace.
// NOTE(review): since no separate output buffers exist, the sorted result is
// presumably written back into conf_scores_gpu / index_array_gpu, with
// background_label_id and confidence_threshold used to filter entries —
// confirm against the definition.
pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes,
int num_preds_per_class,
int background_label_id,
float confidence_threshold, DataType DT_SCORE,
void* conf_scores_gpu, void* index_array_gpu,
void* workspace);
// Declared here, defined elsewhere. Takes an array of `count` sizes and
// returns a single size_t.
// NOTE(review): presumably the sum of the entries, possibly with per-entry
// alignment padding — confirm against the definition.
size_t calculateTotalWorkspaceSize(size_t* workspaces, int count);
// Declared here, defined elsewhere. Returns a pluginStatus_t.
// `data` is const and `new_data` is non-const, so `data` is read-only input
// and `new_data` is the only writable buffer.
// NOTE(review): presumably reorders `data` into `new_data` across the
// class/data/dim axes and applies a sigmoid when confSigmoid is true —
// confirm the exact layout against the definition.
pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes,
int num_data, int num_dim, DataType DT_DATA,
bool confSigmoid, const void* data, void* new_data);
// Declared here, defined elsewhere. Returns a size_t computed from N and C2.
// NOTE(review): presumably the size of a pre-NMS score/index buffer —
// confirm the element type and unit against the definition.
size_t detectionForwardPreNMSSize(int N, int C2);
// Declared here, defined elsewhere. Returns a size_t computed from N,
// numClasses and topK.
// NOTE(review): presumably the size of a post-NMS score/index buffer —
// confirm the element type and unit against the definition.
size_t detectionForwardPostNMSSize(int N, int numClasses, int topK);
// Declared here, defined elsewhere. Returns a pluginStatus_t. `clipBoxes` is
// optional and defaults to true.
// `indices`, `scores` and `bboxData` are const inputs; `nmsedDets` and
// `nmsedLabels` are the only writable buffers.
// NOTE(review): presumably collects up to keepTopK surviving detections per
// image into nmsedDets / nmsedLabels and clamps box coordinates when
// clipBoxes is set — confirm the output layout against the definition.
pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation,
int numImages, int numPredsPerClass,
int numClasses, int topK, int keepTopK,
DataType DT_BBOX, DataType DT_SCORE,
const void* indices, const void* scores,
const void* bboxData, void* nmsedDets,
void* nmsedLabels, bool clipBoxes = true);
// Declared here, defined elsewhere. Returns a size_t computed from the
// problem dimensions and the box/score data types.
// NOTE(review): presumably the total scratch size that nmsInference expects
// in its `workspace` argument — confirm, including the unit.
size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1,
int C2, int numClasses,
int numPredsPerClass, int topK,
DataType DT_BBOX, DataType DT_SCORE);
// Declared here, defined elsewhere. Returns a pluginStatus_t.
// `locData` and `confData` are const inputs; `nmsedDets`, `nmsedLabels` and
// `workspace` are writable. Optional trailing flags default to
// isNormalized = true, confSigmoid = false, clipBoxes = true.
// NOTE(review): presumably the top-level batched-NMS pipeline that combines
// the other functions declared in this header, with `workspace` sized by
// detectionInferenceWorkspaceSize — confirm against the definition.
pluginStatus_t nmsInference(cudaStream_t stream, int N, int boxesSize,
int scoresSize, bool shareLocation,
int backgroundLabelId, int numPredsPerClass,
int numClasses, int topK, int keepTopK,
float scoreThreshold, float iouThreshold,
DataType DT_BBOX, const void* locData,
DataType DT_SCORE, const void* confData,
void* nmsedDets, void* nmsedLabels, void* workspace,
bool isNormalized = true, bool confSigmoid = false,
bool clipBoxes = true);
#endif