diff --git a/extras/mmdeploy/add_extra_steps.py b/extras/mmdeploy/add_extra_steps.py
index f80ea77..d3ace0e 100644
--- a/extras/mmdeploy/add_extra_steps.py
+++ b/extras/mmdeploy/add_extra_steps.py
@@ -145,6 +145,64 @@ def add_steps_to_onnx(model_path):
         # Update the output's data type info
         output.type.tensor_type.elem_type = TensorProto.FLOAT
 
+    # Merge the two outputs
+    if "det" in model_path:
+        r1_output = "dets"
+        r2_output = "labels"
+        out_name = "bboxes"
+        out_dim = 6
+    if "pose" in model_path:
+        r1_output = "kpts"
+        r2_output = "scores"
+        out_name = "keypoints"
+        out_dim = 3
+    if "det" in model_path or "pose" in model_path:
+        # Node to expand the second output: [B, N] -> [B, N, 1]
+        r2_expanded = r2_output + "_expanded"
+        unsqueeze_node = helper.make_node(
+            "Unsqueeze",
+            inputs=[r2_output],
+            outputs=[r2_expanded],
+            axes=[2],
+            name="Unsqueeze",
+        )
+
+        # Node to concatenate both outputs along the last axis
+        r12_merged = out_name
+        concat_node = helper.make_node(
+            "Concat",
+            inputs=[r1_output, r2_expanded],
+            outputs=[r12_merged],
+            axis=2,
+            name="Merged",
+        )
+
+        # Define the new concatenated output
+        merged_output = helper.make_tensor_value_info(
+            r12_merged,
+            TensorProto.FLOAT,
+            [
+                (
+                    graph.input[0].type.tensor_type.shape.dim[0].dim_value
+                    if graph.input[0].type.tensor_type.shape.dim[0].dim_value > 0
+                    else None
+                ),
+                (
+                    graph.output[0].type.tensor_type.shape.dim[1].dim_value
+                    if graph.output[0].type.tensor_type.shape.dim[1].dim_value > 0
+                    else None
+                ),
+                out_dim,
+            ],
+        )
+
+        # Update the graph: add the new nodes, drop the two old outputs,
+        # register the merged one
+        graph.node.append(unsqueeze_node)
+        graph.node.append(concat_node)
+        graph.output.pop()
+        graph.output.pop()
+        graph.output.append(merged_output)
+
     path = re.sub(r"(x)(\d+)x(\d+)x(\d+)", r"\1\3x\4x\2", model_path)
     path = path.replace(".onnx", "_extra-steps.onnx")
     onnx.save(model, path)
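The merge above is plain graph surgery: for the detection model it expands `labels` from `[B, N]` to `[B, N, 1]` and concatenates it onto `dets` `[B, N, 5]`, giving a single `bboxes` output of shape `[B, N, 6]` (likewise `kpts` + `scores` -> `keypoints` `[B, N, 3]`). A minimal numpy sketch of what the two inserted nodes compute, with illustrative shapes; note that passing `axes` as a node attribute targets Unsqueeze from opset 12 or earlier, since opset 13 moved `axes` to a second input:

```python
import numpy as np

# Stand-ins for the detection model's two raw outputs (B=1, N=3 candidates).
dets = np.random.rand(1, 3, 5).astype(np.float32)   # (x1, y1, x2, y2, score)
labels = np.random.rand(1, 3).astype(np.float32)    # class ids, already cast to float

expanded = np.expand_dims(labels, axis=2)           # Unsqueeze(axes=[2]) -> [B, N, 1]
bboxes = np.concatenate([dets, expanded], axis=2)   # Concat(axis=2)      -> [B, N, 6]
assert bboxes.shape == (1, 3, 6)
```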
diff --git a/ros/rpt2D_wrapper_cpp/src/utils_2d_pose.hpp b/ros/rpt2D_wrapper_cpp/src/utils_2d_pose.hpp
index 782a9f5..3b8a459 100644
--- a/ros/rpt2D_wrapper_cpp/src/utils_2d_pose.hpp
+++ b/ros/rpt2D_wrapper_cpp/src/utils_2d_pose.hpp
@@ -22,7 +22,7 @@ namespace utils_2d_pose
         explicit BaseModel(const std::string &model_path, int warmup_iterations);
         virtual ~BaseModel() = default;
 
-        std::vector<Ort::Value> call_by_image(const cv::Mat &img);
+        std::vector<std::vector<std::vector<float>>> call_by_image(const cv::Mat &img);
 
     protected:
         static Ort::Env &get_env()
@@ -200,7 +200,7 @@ namespace utils_2d_pose
 
     // =============================================================================================
 
-    std::vector<Ort::Value> BaseModel::call_by_image(const cv::Mat &img)
+    std::vector<std::vector<std::vector<float>>> BaseModel::call_by_image(const cv::Mat &img)
     {
         size_t height = img.rows;
         size_t width = img.cols;
@@ -222,8 +222,38 @@ namespace utils_2d_pose
                                                          shape.data(),
                                                          shape.size())));
 
+        // Call model
         auto outputs = call_model(input_tensors);
-        return outputs;
+
+        // Get pointer to output tensor
+        const float *tensor_data = outputs[0].GetTensorData<float>();
+        auto data_info = outputs[0].GetTensorTypeAndShapeInfo();
+        auto shape0 = data_info.GetShape();
+        size_t B = (size_t)shape0[0];
+        size_t N = (size_t)shape0[1];
+        size_t C = (size_t)shape0[2];
+
+        // Convert the flat row-major buffer to nested vectors of values
+        std::vector<std::vector<std::vector<float>>> data;
+        data.reserve(B);
+        for (size_t i = 0; i < B; i++)
+        {
+            std::vector<std::vector<float>> item;
+            item.reserve(N);
+            for (size_t j = 0; j < N; j++)
+            {
+                std::vector<float> values;
+                values.reserve(C);
+                for (size_t k = 0; k < C; k++)
+                {
+                    values.push_back(tensor_data[i * N * C + j * C + k]);
+                }
+                item.push_back(values);
+            }
+            data.push_back(item);
+        }
+
+        return data;
     }
 
     // =============================================================================================
 
@@ -566,8 +596,9 @@ namespace utils_2d_pose
         std::unique_ptr<LetterBox> letterbox;
 
         cv::Mat preprocess(const cv::Mat &image);
-        std::vector<std::vector<float>> postprocess(const std::vector<Ort::Value> &result,
-                                                    const cv::Mat &image);
+        std::vector<std::vector<float>> postprocess(
+            const std::vector<std::vector<std::vector<float>>> &result,
+            const cv::Mat &image);
         void clip_boxes(std::vector<std::vector<float>> &boxes,
                         const cv::Mat &image) const;
 
@@ -611,35 +642,21 @@ namespace utils_2d_pose
     // =============================================================================================
 
     std::vector<std::vector<float>> RTMDet::postprocess(
-        const std::vector<Ort::Value> &result,
+        const std::vector<std::vector<std::vector<float>>> &result,
         const cv::Mat &image)
     {
-        // Expected output shapes:
-        //   result[0] => shape [1, N, 5] => (x1,y1,x2,y2,score)
-        //   result[1] => shape [1, N]    => classes
+        // Expected result shape: [B, N, 6] => (x1,y1,x2,y2,score,class)
 
-        // Get pointer to boxes
-        const float *boxes_data = result[0].GetTensorData<float>();
-        const float *classes_data = result[1].GetTensorData<float>();
-        auto data_info = result[0].GetTensorTypeAndShapeInfo();
-        auto shape0 = data_info.GetShape();
-        if (shape0.size() != 3 || shape0[0] != 1 || shape0[2] != 5)
-        {
-            throw std::runtime_error("parse_outputs: unexpected shape for boxes");
-        }
-        size_t N = (size_t)shape0[1];
-
-        // Extract human boxes
+        // Convert to vector of boxes
         std::vector<std::vector<float>> boxes;
-        boxes.reserve(N);
-        for (size_t i = 0; i < N; i++)
+        for (auto &item : result[0])
         {
-            float x1 = boxes_data[i * 5 + 0];
-            float y1 = boxes_data[i * 5 + 1];
-            float x2 = boxes_data[i * 5 + 2];
-            float y2 = boxes_data[i * 5 + 3];
-            float score = boxes_data[i * 5 + 4];
-            float cls = classes_data[i];
+            float x1 = item[0];
+            float y1 = item[1];
+            float x2 = item[2];
+            float y2 = item[3];
+            float score = item[4];
+            float cls = item[5];
 
             if (cls == 0)
             {
@@ -746,7 +763,7 @@ namespace utils_2d_pose
         cv::Mat preprocess(const cv::Mat &image,
                            const std::vector<std::vector<float>> &bboxes);
         std::vector<std::vector<float>> postprocess(
-            const std::vector<Ort::Value> &result,
+            const std::vector<std::vector<std::vector<float>>> &result,
             const cv::Mat &image,
             const std::vector<std::vector<float>> &bboxes);
 
@@ -801,33 +818,20 @@ namespace utils_2d_pose
     // =============================================================================================
 
     std::vector<std::vector<float>> RTMPose::postprocess(
-        const std::vector<Ort::Value> &result,
+        const std::vector<std::vector<std::vector<float>>> &result,
         const cv::Mat &image,
         const std::vector<std::vector<float>> &bboxes)
     {
-        // Expected output shapes:
-        //   result[0] => shape [1, N, 2] => (x,y)
-        //   result[1] => shape [1, N]    => scores
+        // Expected result shape: [B, N, 3] => (x,y,score)
 
-        // Get pointer to boxes
-        const float *kpts_data = result[0].GetTensorData<float>();
-        const float *scores_data = result[1].GetTensorData<float>();
-        auto data_info = result[0].GetTensorTypeAndShapeInfo();
-        auto shape0 = data_info.GetShape();
-        if (shape0.size() != 3 || shape0[0] != 1 || shape0[2] != 2)
-        {
-            throw std::runtime_error("parse_outputs: unexpected shape for keypoints");
-        }
-        size_t N = (size_t)shape0[1];
-
-        // Extract human keypoints
+        // Convert to vector of keypoints
         std::vector<std::vector<float>> kpts;
-        kpts.reserve(N);
-        for (size_t i = 0; i < N; i++)
+        for (auto &item : result[0])
         {
-            float x = kpts_data[i * 2 + 0];
-            float y = kpts_data[i * 2 + 1];
-            float score = scores_data[i];
+            float x = item[0];
+            float y = item[1];
+            float score = item[2];
+
             kpts.push_back({x, y, score});
         }
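`call_by_image` now unflattens the merged output itself: ONNX Runtime hands back a flat row-major buffer for a `[B, N, C]` tensor, and the triple loop rebuilds it as nested vectors using the offset `i * N * C + j * C + k`. A quick Python sanity check of that index arithmetic, with illustrative shapes:

```python
import numpy as np

# Verify that i * N * C + j * C + k walks a [B, N, C] tensor in the same
# row-major order ONNX Runtime uses for its flat output buffer.
B, N, C = 1, 4, 6
flat = np.arange(B * N * C, dtype=np.float32)  # stand-in for GetTensorData<float>()

data = [[[float(flat[i * N * C + j * C + k]) for k in range(C)]
         for j in range(N)]
        for i in range(B)]

assert np.array_equal(np.asarray(data), flat.reshape(B, N, C))
```

Copying into nested vectors costs one allocation per row, which is negligible for the small `N` these models emit, and it frees the `postprocess` methods from holding on to `Ort::Value` objects.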
diff --git a/scripts/utils_2d_pose.py b/scripts/utils_2d_pose.py
index 63eb89e..7b364e3 100644
--- a/scripts/utils_2d_pose.py
+++ b/scripts/utils_2d_pose.py
@@ -338,9 +338,8 @@ class RTMDet(BaseModel):
     def postprocess(self, result: List[np.ndarray], image: np.ndarray):
         boxes = np.squeeze(result[0], axis=0)
-        classes = np.squeeze(result[1], axis=0)
 
-        human_class = classes[:] == 0
+        human_class = boxes[:, 5] == 0
         boxes = boxes[human_class]
 
         keep = boxes[:, 4] > self.conf_threshold
@@ -408,10 +407,7 @@ class RTMPose(BaseModel):
     ):
         kpts = []
         for i in range(len(bboxes)):
-            scores = np.clip(result[1][i], 0, 1)
-            kp = np.concatenate(
-                [result[0][i], np.expand_dims(scores, axis=-1)], axis=-1
-            )
+            kp = result[0][i]
 
             paddings, scale, bbox, _ = self.boxcrop.calc_params(image.shape, bboxes[i])
             kp[:, 0] -= paddings[0]
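With the merged outputs, the Python side reduces to column slicing: class ids sit in column 5 of the `[N, 6]` detection array, and keypoint scores in column 2 of the `[N, 3]` pose array. One thing to note: the old RTMPose path clipped scores into `[0, 1]` with `np.clip` before concatenating, whereas the merged output is now used as-is. A made-up example of the new RTMDet filtering (values and threshold are illustrative, not real model output):

```python
import numpy as np

conf_threshold = 0.5  # stand-in for self.conf_threshold

boxes = np.array([
    [10.0, 10.0, 50.0, 80.0, 0.90, 0.0],  # person, confident -> kept
    [20.0, 15.0, 60.0, 90.0, 0.30, 0.0],  # person, low score -> dropped
    [ 5.0,  5.0, 30.0, 40.0, 0.95, 1.0],  # non-person class  -> dropped
], dtype=np.float32)                       # columns: x1, y1, x2, y2, score, class

boxes = boxes[boxes[:, 5] == 0]            # class filter, as in the new code
boxes = boxes[boxes[:, 4] > conf_threshold]
print(boxes.shape)                         # (1, 6)
```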