Batching of poses in cpp implementation.

This commit is contained in:
Daniel
2025-01-20 12:20:25 +01:00
parent 8f2322694a
commit 86d8ccf797
5 changed files with 206 additions and 117 deletions

View File

@ -24,7 +24,7 @@ Run pose estimator with ros topics as inputs and publish detected poses.
```bash
cd /RapidPoseTriangulation/ros/rpt2D_wrapper_cpp/tests/
g++ -std=c++17 -O3 -march=native -DCOMPILE_EXAMPLE_MAIN \
g++ -std=c++17 -O3 -march=native -Wall -DCOMPILE_EXAMPLE_MAIN \
$(pkg-config --cflags opencv4) \
-I /onnxruntime/include \
-I /onnxruntime/include/onnxruntime/core/session \

View File

@ -32,7 +32,7 @@ static const std::string pose_out_topic = "/poses/" + cam_id;
static const float min_bbox_score = 0.4;
static const float min_bbox_area = 0.1 * 0.1;
static const bool batch_poses = false;
static const bool batch_poses = true;
// =================================================================================================
// =================================================================================================

View File

@ -22,7 +22,8 @@ namespace utils_2d_pose
explicit BaseModel(const std::string &model_path, int warmup_iterations);
virtual ~BaseModel() = default;
std::vector<std::vector<std::vector<float>>> call_by_image(const cv::Mat &img);
std::vector<std::vector<std::vector<float>>> call_by_image(
const std::vector<cv::Mat> &images);
protected:
static Ort::Env &get_env()
@ -157,6 +158,8 @@ namespace utils_2d_pose
void BaseModel::warmup(int epoch)
{
std::vector<int> use_batch_sizes = {4, 10, 5, 1, 9, 3, 7, 2, 8, 4, 6, 1, 3, 7, 2};
std::cout << "Warmup: ";
for (int e = 0; e < epoch; e++)
{
@ -170,28 +173,32 @@ namespace utils_2d_pose
// If the model's batch dimension is not fixed to 1, cycle through preset warmup batch sizes
if (batch_size != 1)
{
throw std::runtime_error("Warmup only supports batch=1 for demonstration.");
batch_size = use_batch_sizes[e % use_batch_sizes.size()];
}
// Create a multi-channel Mat of shape (Height, Width, Channels)
cv::Mat img(height, width, CV_8UC(channels));
std::vector<cv::Mat> images;
for (int i = 0; i < batch_size; i++)
{
// Create a multi-channel Mat of shape (Height, Width, Channels)
cv::Mat img(height, width, CV_8UC(channels));
if (channels == 1)
{
cv::randu(img, 0, 256);
}
else if (channels == 3)
{
cv::randu(img, cv::Scalar(0, 0, 0), cv::Scalar(256, 256, 256));
}
else
{
throw std::runtime_error("Unsupported number of channels for warmup.");
}
// For multi-channel images, randu requires matching channels in low/high
if (channels == 1)
{
cv::randu(img, 0, 256);
}
else if (channels == 3)
{
cv::randu(img, cv::Scalar(0, 0, 0), cv::Scalar(256, 256, 256));
}
else
{
throw std::runtime_error("Unsupported number of channels for warmup.");
images.push_back(std::move(img));
}
// Call the model
auto outputs = call_by_image(img);
auto outputs = call_by_image(images);
std::cout << "#";
}
@ -200,29 +207,46 @@ namespace utils_2d_pose
// =============================================================================================
std::vector<std::vector<std::vector<float>>> BaseModel::call_by_image(const cv::Mat &img)
std::vector<std::vector<std::vector<float>>> BaseModel::call_by_image(
const std::vector<cv::Mat> &images)
{
size_t height = img.rows;
size_t width = img.cols;
size_t channels = img.channels();
size_t batch_size = images.size();
int height = images[0].rows;
int width = images[0].cols;
int channels = images[0].channels();
// Create a vector of Ort::Value for the input tensors
std::vector<Ort::Value> input_tensors;
input_tensors.reserve(input_names.size());
// Flatten into single vector
size_t single_img_size = height * width * channels;
size_t total_elems = batch_size * single_img_size;
std::vector<uint8_t> batch_data(total_elems);
for (size_t b = 0; b < batch_size; b++)
{
auto &img = images[b];
if (img.rows != height || img.cols != width || img.channels() != channels)
{
throw std::runtime_error("All images must have the same dimensions!");
}
std::memcpy(batch_data.data() + b * single_img_size, img.data, single_img_size);
}
// Flatten and create an ONNX tensor
size_t total_elems = static_cast<size_t>(height * width * channels);
// Create an onnx tensor
auto mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
auto shape = input_shapes[0];
input_tensors.push_back(
std::move(Ort::Value::CreateTensor<uint8_t>(
mem_info,
img.data,
total_elems,
shape.data(),
shape.size())));
std::vector<int64_t> shape = {
static_cast<int64_t>(batch_size),
static_cast<int64_t>(height),
static_cast<int64_t>(width),
static_cast<int64_t>(channels),
};
Ort::Value input_tensor = Ort::Value::CreateTensor<uint8_t>(
mem_info,
batch_data.data(),
total_elems,
shape.data(),
shape.size());
// Call model
std::vector<Ort::Value> input_tensors;
input_tensors.push_back(std::move(input_tensor));
auto outputs = call_model(input_tensors);
// Get pointer to output tensor
@ -612,18 +636,19 @@ namespace utils_2d_pose
std::chrono::duration<float> elapsed;
cv::Mat preprocessed = preprocess(image);
std::vector<cv::Mat> inputs = {preprocessed};
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Preprocess time: " << elapsed.count() << "s\n";
stime = std::chrono::high_resolution_clock::now();
auto inputs = call_by_image(preprocessed);
auto results = call_by_image(inputs);
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Inference time: " << elapsed.count() << "s\n";
stime = std::chrono::high_resolution_clock::now();
auto outputs = postprocess(inputs, image);
auto outputs = postprocess(results, image);
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Postprocess time: " << elapsed.count() << "s\n";
@ -754,45 +779,47 @@ namespace utils_2d_pose
this->boxcrop = std::make_unique<BoxCrop>(target_size, 1.25f, 114);
}
std::vector<std::array<float, 3>> call(const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes);
std::vector<std::vector<std::array<float, 3>>> call(const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes);
private:
std::array<size_t, 2> target_size;
std::unique_ptr<BoxCrop> boxcrop;
cv::Mat preprocess(const cv::Mat &image, const std::vector<std::array<float, 5>> &bboxes);
std::vector<std::array<float, 3>> postprocess(
std::vector<cv::Mat> preprocess(
const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes);
std::vector<std::vector<std::array<float, 3>>> postprocess(
const std::vector<std::vector<std::vector<float>>> &result,
const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes);
void clip_keypoints(std::vector<std::array<float, 3>> &kp,
const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes) const;
const std::array<float, 5> &bbox) const;
};
// =============================================================================================
std::vector<std::array<float, 3>> RTMPose::call(
std::vector<std::vector<std::array<float, 3>>> RTMPose::call(
const cv::Mat &image, const std::vector<std::array<float, 5>> &bboxes)
{
auto stime = std::chrono::high_resolution_clock::now();
std::chrono::duration<float> elapsed;
cv::Mat preprocessed = preprocess(image, bboxes);
std::vector<cv::Mat> inputs = preprocess(image, bboxes);
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Preprocess time: " << elapsed.count() << "s\n";
stime = std::chrono::high_resolution_clock::now();
auto inputs = call_by_image(preprocessed);
auto results = call_by_image(inputs);
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Inference time: " << elapsed.count() << "s\n";
stime = std::chrono::high_resolution_clock::now();
auto outputs = postprocess(inputs, image, bboxes);
auto outputs = postprocess(results, image, bboxes);
elapsed = std::chrono::high_resolution_clock::now() - stime;
std::cout << "Postprocess time: " << elapsed.count() << "s\n";
@ -802,62 +829,64 @@ namespace utils_2d_pose
// =============================================================================================
cv::Mat RTMPose::preprocess(
std::vector<cv::Mat> RTMPose::preprocess(
const cv::Mat &image, const std::vector<std::array<float, 5>> &bboxes)
{
// Assert only one box used
if (bboxes.size() != 1)
std::vector<cv::Mat> crops;
for (auto &bbox : bboxes)
{
throw std::runtime_error("Only one box is supported!");
cv::Mat cropped = boxcrop->crop_resize_box(image, bbox);
crops.push_back(std::move(cropped));
}
cv::Mat cropped = boxcrop->crop_resize_box(image, bboxes[0]);
return cropped;
return crops;
}
// =============================================================================================
std::vector<std::array<float, 3>> RTMPose::postprocess(
std::vector<std::vector<std::array<float, 3>>> RTMPose::postprocess(
const std::vector<std::vector<std::vector<float>>> &result,
const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes)
{
// Expected result shape: [B, N, 3] => (x,y,score)
// Convert to vector of keypoints
std::vector<std::array<float, 3>> kpts;
for (auto &item : result[0])
std::vector<std::vector<std::array<float, 3>>> keypoints;
for (size_t i = 0; i < result.size(); i++)
{
float x = item[0];
float y = item[1];
float score = item[2];
// Convert to vector of keypoints
std::vector<std::array<float, 3>> kpts;
for (auto &item : result[i])
{
float x = item[0];
float y = item[1];
float score = item[2];
kpts.push_back({x, y, score});
kpts.push_back({x, y, score});
}
if (!kpts.empty())
{
// Shift by boxcrop padding and scale back
clip_keypoints(kpts, image, bboxes[i]);
keypoints.push_back(std::move(kpts));
}
}
if (kpts.empty())
{
return {};
}
// Shift by boxcrop padding and scale back
clip_keypoints(kpts, image, bboxes);
return kpts;
return keypoints;
}
// =============================================================================================
void RTMPose::clip_keypoints(std::vector<std::array<float, 3>> &kpts,
const cv::Mat &image,
const std::vector<std::array<float, 5>> &bboxes) const
const std::array<float, 5> &bbox) const
{
// Get paddings, scale from boxcrop
auto [paddings, scale, bbox, _] = boxcrop->calc_params(image, bboxes[0]);
auto [paddings, scale, box, _] = boxcrop->calc_params(image, bbox);
int pad_left = paddings[0];
int pad_top = paddings[2];
int box_left = bbox[0];
int box_top = bbox[1];
int box_left = box[0];
int box_top = box[1];
int img_w = image.cols;
int img_h = image.rows;
@ -917,10 +946,20 @@ namespace utils_2d_pose
auto bboxes = det_model->call(image);
std::vector<std::vector<std::array<float, 3>>> keypoints;
for (auto &bbox : bboxes)
if (this->batch_poses)
{
auto kpts = pose_model->call(image, {bbox});
keypoints.push_back(std::move(kpts));
if (!bboxes.empty())
{
keypoints = std::move(pose_model->call(image, bboxes));
}
}
else
{
for (size_t i = 0; i < bboxes.size(); i++)
{
auto kpts = pose_model->call(image, {bboxes[i]});
keypoints.push_back(std::move(kpts[0]));
}
}
return keypoints;
}
@ -937,18 +976,43 @@ namespace utils_2d_pose
bool batch_poses = false)
{
std::string base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/";
std::string model_det = base_path + "rtmdet-nano_1x320x320x3_fp16_extra-steps.onnx";
std::string model_pose1 = base_path + "rtmpose-m_1x384x288x3_fp16_extra-steps.onnx";
std::string model_pose2 = base_path + "rtmpose-l_wb_1x384x288x3_fp16_extra-steps.onnx";
std::string path_det = base_path + "rtmdet-nano_1x320x320x3_fp16_extra-steps.onnx";
std::string path_pose_m1 = base_path + "rtmpose-m_1x384x288x3_fp16_extra-steps.onnx";
std::string path_pose_mb = base_path + "rtmpose-m_Bx384x288x3_fp16_extra-steps.onnx";
std::string path_pose_w1 = base_path + "rtmpose-l_wb_1x384x288x3_fp16_extra-steps.onnx";
std::string path_pose_wb = base_path + "rtmpose-l_wb_Bx384x288x3_fp16_extra-steps.onnx";
std::string model_pose = whole_body ? model_pose2 : model_pose1;
this->num_joints = whole_body ? 133 : 17;
std::string path_pose;
if (!whole_body)
{
if (!batch_poses)
{
path_pose = path_pose_m1;
}
else
{
path_pose = path_pose_mb;
}
}
else
{
if (!batch_poses)
{
path_pose = path_pose_w1;
}
else
{
path_pose = path_pose_wb;
}
}
std::cout << "Loading 2D" + std::string(whole_body ? "-WB" : "") + " models ..."
<< std::endl;
this->topdown_model = std::make_unique<TopDown>(
model_det, model_pose, min_bbox_score, min_bbox_area, batch_poses, 30);
path_det, path_pose, min_bbox_score, min_bbox_area, batch_poses, 30);
std::cout << "Loaded models \n"
<< std::endl;

View File

@ -25,49 +25,72 @@ int main(int argc, char **argv)
"/RapidPoseTriangulation/data/h1/60457274-img_003201.jpg",
};
// RTMDet model1(model_path1, 0.3, 0.1 * 0.1, 30);
// RTMPose model2(model_path2, 30);
// for (size_t i = 0; i < img_paths.size(); i++)
// {
// cv::Mat img = cv::imread(img_paths[i]);
// cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// auto outputs1 = model1.call(img);
// std::cout << "Model1 outputs: " << outputs1[0][0] << " " << outputs1[0][1] << " "
// << outputs1[0][2] << " " << outputs1[0][3] << " " << outputs1[0][4] << " "
// << std::endl;
// for (size_t j = 0; j < outputs1.size(); j++)
// std::cout << "\nTesting RTMDet and RTMPose" << std::endl;
// RTMDet model1(model_path1, 0.3, 0.1 * 0.1, 30);
// RTMPose model2(model_path2, 30);
// for (size_t i = 0; i < img_paths.size(); i++)
// {
// std::vector<std::array<float, 5>> bboxes = {outputs1[j]};
// auto outputs2 = model2.call(img, bboxes);
// std::cout << "Model2 outputs: " << outputs2[0][0] << " " << outputs2[0][1] << " "
// << outputs2[0][2] << " " << std::endl;
// cv::Mat img = cv::imread(img_paths[i]);
// cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// auto outputs1 = model1.call(img);
// std::cout << "Model1 outputs: " << outputs1[0][0] << " " << outputs1[0][1] << " "
// << outputs1[0][2] << " " << outputs1[0][3] << " " << outputs1[0][4] << " "
// << std::endl;
// for (size_t j = 0; j < outputs1.size(); j++)
// {
// std::vector<std::array<float, 5>> bboxes = {outputs1[j]};
// auto outputs2 = model2.call(img, bboxes);
// std::cout << "Model2 outputs: " << outputs2[0][0][0] << " "
// << outputs2[0][0][1] << " " << outputs2[0][0][2] << " " << std::endl;
// }
// }
// }
// TopDown model3(model_path1, model_path2, 0.3, 0.1 * 0.1, 30);
// for (size_t i = 0; i < img_paths.size(); i++)
// {
// cv::Mat img = cv::imread(img_paths[i]);
// cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// std::cout << "\nTesting TopDown" << std::endl;
// TopDown model3(model_path1, model_path2, 0.3, 0.1 * 0.1, false, 30);
// for (size_t i = 0; i < img_paths.size(); i++)
// {
// cv::Mat img = cv::imread(img_paths[i]);
// cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// auto outputs3 = model3.predict(img);
// std::cout << "Model3 outputs: " << outputs3[0][0][0] << " " << outputs3[0][0][1] << " "
// << outputs3[0][0][2] << " " << std::endl;
// auto outputs3 = model3.predict(img);
// std::cout << "Model3 outputs: " << outputs3[0][0][0] << " "
// << outputs3[0][0][1] << " " << outputs3[0][0][2] << " " << std::endl;
// }
// }
PosePredictor model4(false, 0.3, 0.1 * 0.1, false);
std::vector<cv::Mat> images;
for (size_t i = 0; i < img_paths.size(); i++)
{
cv::Mat img = cv::imread(img_paths[i]);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
images.push_back(img);
std::cout << "\nTesting PosePredictor 1" << std::endl;
PosePredictor model4(false, 0.3, 0.1 * 0.1, false);
std::vector<cv::Mat> images;
for (size_t i = 0; i < img_paths.size(); i++)
{
cv::Mat img = cv::imread(img_paths[i]);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
images.push_back(img);
}
auto outputs4 = model4.predict(images);
std::cout << "Model4 outputs: " << outputs4[0][0][0][0] << " "
<< outputs4[0][0][0][1] << " " << outputs4[0][0][0][2] << " " << std::endl;
}
auto outputs4 = model4.predict(images);
std::cout << "Model4 outputs: " << outputs4[0][0][0][0] << " " << outputs4[0][0][0][1] << " "
<< outputs4[0][0][0][2] << " " << std::endl;
{
std::cout << "\nTesting PosePredictor 2" << std::endl;
PosePredictor model5(false, 0.3, 0.1 * 0.1, true);
std::vector<cv::Mat> images;
for (size_t i = 0; i < img_paths.size(); i++)
{
cv::Mat img = cv::imread(img_paths[i]);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
images.push_back(img);
}
auto outputs5 = model5.predict(images);
std::cout << "Model5 outputs: " << outputs5[0][0][0][0] << " "
<< outputs5[0][0][0][1] << " " << outputs5[0][0][0][2] << " " << std::endl;
}
return 0;
}

View File

@ -37,7 +37,7 @@ stop_flag = False
# Model config
min_bbox_score = 0.4
min_bbox_area = 0.1 * 0.1
batch_poses = False
batch_poses = True
# ==================================================================================================
@ -169,7 +169,9 @@ def main():
# Load 2D pose model
whole_body = test_triangulate.whole_body
if any((whole_body[k] for k in whole_body)):
kpt_model = utils_2d_pose.load_wb_model()
kpt_model = utils_2d_pose.load_wb_model(
min_bbox_score, min_bbox_area, batch_poses
)
else:
kpt_model = utils_2d_pose.load_model(min_bbox_score, min_bbox_area, batch_poses)