From c5f190ab35cac6c4ae7d14427656f89a77c1b242 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Tue, 21 Jan 2025 15:10:43 +0100
Subject: [PATCH] Eval skelda datasets with cpp implementation.

---
 README.md                                     |  21 +
 scripts/.gitignore                            |   1 +
 scripts/test_skelda_dataset_cpp.cpp           | 266 ++++++++++++
 scripts/test_skelda_dataset_cpp.py            | 395 ++++++++++++++++++
 ...a_dataset.py => test_skelda_dataset_py.py} |   0
 5 files changed, 683 insertions(+)
 create mode 100644 scripts/.gitignore
 create mode 100644 scripts/test_skelda_dataset_cpp.cpp
 create mode 100644 scripts/test_skelda_dataset_cpp.py
 rename scripts/{test_skelda_dataset.py => test_skelda_dataset_py.py} (100%)

diff --git a/README.md b/README.md
index 2a010bf..809c85b 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,27 @@ Fast triangulation of multiple persons from multiple camera views.
 - Build triangulator:
   ```bash
   cd /RapidPoseTriangulation/swig/ && make all && cd ../tests/ && python3 test_interface.py && cd ..
+
+  cd /RapidPoseTriangulation/scripts/ && \
+  g++ -std=c++17 -fPIC -O3 -march=native -Wall -Werror -flto=auto -fopenmp -fopenmp-simd \
+  $(pkg-config --cflags opencv4) \
+  -I /RapidPoseTriangulation/rpt/ \
+  -I /onnxruntime/include/ \
+  -I /onnxruntime/include/onnxruntime/core/session/ \
+  -I /onnxruntime/include/onnxruntime/core/providers/tensorrt/ \
+  -L /onnxruntime/build/Linux/Release/ \
+  test_skelda_dataset_cpp.cpp \
+  /RapidPoseTriangulation/rpt/*.cpp \
+  -o test_skelda_dataset \
+  -Wl,--start-group \
+  -lonnxruntime_providers_tensorrt \
+  -lonnxruntime_providers_shared \
+  -lonnxruntime_providers_cuda \
+  -lonnxruntime \
+  -Wl,--end-group \
+  $(pkg-config --libs opencv4) \
+  -Wl,-rpath,/onnxruntime/build/Linux/Release/ \
+  && cd ..
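+
+  # Afterwards, the skelda evaluation can be started through the Python wrapper,
+  # which exports the selected dataset and then calls the binary built above
+  # (example invocation, assuming the datasets are prepared under /datasets/):
+  # cd /RapidPoseTriangulation/scripts/ && python3 test_skelda_dataset_cpp.py && cd ..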
cam["K"][1][1].get() * (1000.0 / ishape.height); + cam["K"][1][2] = cam["K"][1][2].get() * (1000.0 / ishape.height); + cam["K"][0][0] = cam["K"][0][0].get() * (1000.0 / ishape.width); + cam["K"][0][2] = cam["K"][0][2].get() * (1000.0 / ishape.width); + cv::resize(img, img, cv::Size(1000, 1000)); + images[i] = img; + } + } + } + + // Convert image format to Bayer encoding to simulate real camera input + // This also resulted in notably better MPJPE results in most cases, presumbly since the + // demosaicing algorithm from OpenCV is better than the default one from the cameras + for (size_t i = 0; i < images.size(); i++) + { + cv::Mat &img = images[i]; + cv::Mat bayer_image = utils_pipeline::rgb2bayer(img); + images[i] = std::move(bayer_image); + } + + return images; +} + +// ================================================================================================= + +std::string read_file(const std::string &path) +{ + std::ifstream file_stream(path); + if (!file_stream.is_open()) + { + throw std::runtime_error("Unable to open file: " + path); + } + + std::stringstream buffer; + buffer << file_stream.rdbuf(); + return buffer.str(); +} + +void write_file(const std::string &path, const std::string &content) +{ + std::ofstream file_stream(path, std::ios::out | std::ios::binary); + if (!file_stream.is_open()) + { + throw std::runtime_error("Unable to open file for writing: " + path); + } + + file_stream << content; + + if (!file_stream) + { + throw std::runtime_error("Error occurred while writing to file: " + path); + } + file_stream.close(); +} + +// ================================================================================================= + +int main(int argc, char **argv) +{ + // Load the files + auto dataset = json::parse(read_file(path_data)); + auto config = json::parse(read_file(path_cfg)); + + // Load the configuration + const std::map whole_body = config["whole_body"]; + const float min_bbox_score = config["min_bbox_score"]; + const float min_bbox_area = config["min_bbox_area"]; + const bool batch_poses = config["batch_poses"]; + const std::vector joint_names_2d = utils_pipeline::get_joint_names(whole_body); + const float min_match_score = config["min_match_score"]; + const size_t min_group_size = config["min_group_size"]; + const int take_interval = config["take_interval"]; + + // Load 2D model + bool use_wb = utils_pipeline::use_whole_body(whole_body); + std::unique_ptr kpt_model = + std::make_unique( + use_wb, min_bbox_score, min_bbox_area, batch_poses); + + // Load 3D model + std::unique_ptr tri_model = std::make_unique( + min_match_score, min_group_size); + + // Timers + size_t time_count = dataset.size(); + double time_image = 0.0; + double time_pose2d = 0.0; + double time_pose3d = 0.0; + size_t print_steps = (size_t)std::floor((float)time_count / 100.0f); + + std::cout << "Running predictions: |"; + size_t bar_width = (size_t)std::ceil((float)time_count / (float)print_steps) - 2; + for (size_t i = 0; i < bar_width; i++) + { + std::cout << "-"; + } + std::cout << "|" << std::endl; + + // Calculate 2D poses [items, views, persons, joints, 3] + std::vector>>>> all_poses_2d; + std::cout << "Calculating 2D poses: "; + for (size_t i = 0; i < dataset.size(); i++) + { + if (i % print_steps == 0) + { + std::cout << "#" << std::flush; + } + std::chrono::duration elapsed; + auto &item = dataset[i]; + + // Load images + auto stime = std::chrono::high_resolution_clock::now(); + std::vector images = load_images(item); + elapsed = std::chrono::high_resolution_clock::now() - stime; 
+
+// =================================================================================================
+
+std::string read_file(const std::string &path)
+{
+    std::ifstream file_stream(path);
+    if (!file_stream.is_open())
+    {
+        throw std::runtime_error("Unable to open file: " + path);
+    }
+
+    std::stringstream buffer;
+    buffer << file_stream.rdbuf();
+    return buffer.str();
+}
+
+void write_file(const std::string &path, const std::string &content)
+{
+    std::ofstream file_stream(path, std::ios::out | std::ios::binary);
+    if (!file_stream.is_open())
+    {
+        throw std::runtime_error("Unable to open file for writing: " + path);
+    }
+
+    file_stream << content;
+
+    if (!file_stream)
+    {
+        throw std::runtime_error("Error occurred while writing to file: " + path);
+    }
+    file_stream.close();
+}
+
+// =================================================================================================
+
+int main(int argc, char **argv)
+{
+    // Load the files
+    auto dataset = json::parse(read_file(path_data));
+    auto config = json::parse(read_file(path_cfg));
+
+    // Load the configuration
+    const std::map<std::string, bool> whole_body = config["whole_body"];
+    const float min_bbox_score = config["min_bbox_score"];
+    const float min_bbox_area = config["min_bbox_area"];
+    const bool batch_poses = config["batch_poses"];
+    const std::vector<std::string> joint_names_2d = utils_pipeline::get_joint_names(whole_body);
+    const float min_match_score = config["min_match_score"];
+    const size_t min_group_size = config["min_group_size"];
+    const int take_interval = config["take_interval"];
+
+    // Load 2D model (the class name is an assumption here)
+    bool use_wb = utils_pipeline::use_whole_body(whole_body);
+    std::unique_ptr<utils_2d_pose::PoseModel> kpt_model =
+        std::make_unique<utils_2d_pose::PoseModel>(
+            use_wb, min_bbox_score, min_bbox_area, batch_poses);
+
+    // Load 3D model (the class name is an assumption here)
+    std::unique_ptr<TriangulationInterface> tri_model = std::make_unique<TriangulationInterface>(
+        min_match_score, min_group_size);
+
+    // Timers
+    size_t time_count = dataset.size();
+    double time_image = 0.0;
+    double time_pose2d = 0.0;
+    double time_pose3d = 0.0;
+    // Clamp the progress step to at least one, to avoid a modulo-by-zero for tiny datasets
+    size_t print_steps = std::max<size_t>(1, (size_t)std::floor((float)time_count / 100.0f));
+
+    std::cout << "Running predictions: |";
+    size_t bar_width = (size_t)std::max<long>(
+        0L, (long)std::ceil((float)time_count / (float)print_steps) - 2);
+    for (size_t i = 0; i < bar_width; i++)
+    {
+        std::cout << "-";
+    }
+    std::cout << "|" << std::endl;
+
+    // Calculate 2D poses [items, views, persons, joints, 3]
+    std::vector<std::vector<std::vector<std::vector<std::vector<float>>>>> all_poses_2d;
+    std::cout << "Calculating 2D poses: ";
+    for (size_t i = 0; i < dataset.size(); i++)
+    {
+        if (i % print_steps == 0)
+        {
+            std::cout << "#" << std::flush;
+        }
+        std::chrono::duration<double> elapsed;
+        auto &item = dataset[i];
+
+        // Load images
+        auto stime = std::chrono::high_resolution_clock::now();
+        std::vector<cv::Mat> images = load_images(item);
+        elapsed = std::chrono::high_resolution_clock::now() - stime;
+        time_image += elapsed.count();
+
+        // Predict 2D poses (demosaicing is counted towards the 2D pose time;
+        // the inner index is j to avoid shadowing the outer loop variable)
+        stime = std::chrono::high_resolution_clock::now();
+        for (size_t j = 0; j < images.size(); j++)
+        {
+            cv::Mat &img = images[j];
+            cv::Mat rgb = utils_pipeline::bayer2rgb(img);
+            images[j] = std::move(rgb);
+        }
+        auto poses_2d_all = kpt_model->predict(images);
+        auto poses_2d_upd = utils_pipeline::update_keypoints(
+            poses_2d_all, joint_names_2d, whole_body);
+        elapsed = std::chrono::high_resolution_clock::now() - stime;
+        time_pose2d += elapsed.count();
+
+        all_poses_2d.push_back(std::move(poses_2d_upd));
+    }
+    std::cout << std::endl;
+
+    // Calculate 3D poses [items, persons, joints, 4]
+    std::vector<std::vector<std::vector<std::vector<float>>>> all_poses_3d;
+    std::vector<std::string> all_ids;
+    std::string old_scene = "";
+    int old_id = -1;
+    std::cout << "Calculating 3D poses: ";
+    for (size_t i = 0; i < dataset.size(); i++)
+    {
+        if (i % print_steps == 0)
+        {
+            std::cout << "#" << std::flush;
+        }
+        std::chrono::duration<double> elapsed;
+        auto &item = dataset[i];
+        auto &poses_2d = all_poses_2d[i];
+
+        if (old_scene != item["scene"] || old_id + take_interval < item["index"])
+        {
+            // Reset last poses if the scene changes or the frames are not consecutive
+            tri_model->reset();
+            old_scene = item["scene"];
+        }
+        // Track the last frame index, so gaps in the sequence trigger a reset as well
+        old_id = item["index"].get<int>();
+
+        auto stime = std::chrono::high_resolution_clock::now();
+        std::vector<Camera> cameras;
+        for (size_t j = 0; j < item["cameras"].size(); j++)
+        {
+            auto &cam = item["cameras"][j];
+            Camera camera;
+            camera.name = cam["name"].get<std::string>();
+            camera.K = cam["K"].get<std::array<std::array<double, 3>, 3>>();
+            camera.DC = cam["DC"].get<std::vector<double>>();
+            camera.R = cam["R"].get<std::array<std::array<double, 3>, 3>>();
+            camera.T = cam["T"].get<std::array<std::array<double, 1>, 3>>();
+            camera.width = cam["width"].get<int>();
+            camera.height = cam["height"].get<int>();
+            camera.type = cam["type"].get<std::string>();
+            cameras.push_back(camera);
+        }
+        std::array<std::array<double, 3>, 2> roomparams = {
+            item["room_size"].get<std::array<double, 3>>(),
+            item["room_center"].get<std::array<double, 3>>()};
+
+        auto poses_3d = tri_model->triangulate_poses(poses_2d, cameras, roomparams, joint_names_2d);
+        elapsed = std::chrono::high_resolution_clock::now() - stime;
+        time_pose3d += elapsed.count();
+
+        all_poses_3d.push_back(std::move(poses_3d));
+        all_ids.push_back(item["id"].get<std::string>());
+    }
+    std::cout << std::endl;
+
+    // Print timing stats (the first frames are treated as warmup)
+    std::cout << "\nMetrics:" << std::endl;
+    tri_model->print_stats();
+    size_t warmup = 10;
+    double avg_time_image = time_image / (time_count - warmup);
+    double avg_time_pose2d = time_pose2d / (time_count - warmup);
+    double avg_time_pose3d = time_pose3d / (time_count - warmup);
+    double fps = 1.0 / (avg_time_pose2d + avg_time_pose3d);
+    std::cout << "{\n"
+              << "  \"img_loading\": " << avg_time_image << ",\n"
+              << "  \"avg_time_2d\": " << avg_time_pose2d << ",\n"
+              << "  \"avg_time_3d\": " << avg_time_pose3d << ",\n"
+              << "  \"fps\": " << fps << "\n"
+              << "}" << std::endl;
+
+    // Store the results as json
+    json all_results;
+    all_results["all_ids"] = all_ids;
+    all_results["all_poses_2d"] = all_poses_2d;
+    all_results["all_poses_3d"] = all_poses_3d;
+    all_results["joint_names_2d"] = joint_names_2d;
+    all_results["joint_names_3d"] = joint_names_2d;
+
+    // Save the results
+    std::string path_results = "/tmp/rpt/results.json";
+    write_file(path_results, all_results.dump(0));
+
+    return 0;
+}
\ No newline at end of file
diff --git a/scripts/test_skelda_dataset_cpp.py b/scripts/test_skelda_dataset_cpp.py
new file mode 100644
index 0000000..1fbdf9d
--- /dev/null
+++ b/scripts/test_skelda_dataset_cpp.py
@@ -0,0 +1,395 @@
+import json
+import os
+
+import utils_pipeline
+from skelda import evals
+from skelda.writers import json_writer
+
+# ==================================================================================================
+
+whole_body = {
+    "foots": False,
+    "face": False,
+    "hands": False,
+}
+
+dataset_use = "human36m"
+# dataset_use = "panoptic"
+# dataset_use = "mvor"
+# dataset_use = "shelf"
+# dataset_use = "campus"
+# dataset_use = "ikeaasm"
+# dataset_use = "chi3d"
+# dataset_use = "tsinghua"
+# dataset_use = "human36m_wb"
+# dataset_use = "egohumans_tagging"
+# dataset_use = "egohumans_legoassemble"
+# dataset_use = "egohumans_fencing"
+# dataset_use = "egohumans_basketball"
+# dataset_use = "egohumans_volleyball"
+# dataset_use = "egohumans_badminton"
+# dataset_use = "egohumans_tennis"
+
+
+# Describes the minimum area as a fraction of the image size for a 2D bounding box to be considered
+# If the persons are small in the image, use a lower value
+default_min_bbox_area = 0.1 * 0.1
+
+# Describes how confident a 2D bounding box needs to be to be considered
+# If the persons are small in the image, or poorly recognizable, use a lower value
+default_min_bbox_score = 0.3
+
+# Describes how well two 2D poses need to match each other to create a valid triangulation
+# If the quality of the 2D detections is poor, use a lower value
+default_min_match_score = 0.94
+
+# Describes the minimum number of camera pairs that need to detect the same person
+# If the number of cameras is high, and the views are not occluded, use a higher value
+default_min_group_size = 1
+
+# Batch poses per image for faster processing
+# If most of the time only one person is in an image, disable it, since batching is then slightly slower
+default_batch_poses = True
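+
+# For example, the default min_bbox_area = 0.1 * 0.1 requires a box to cover at
+# least 1% of the image area, e.g. 20736 of the 2073600 pixels of a 1920x1080
+# image (roughly a 192 x 108 box), before the detection is considered.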
+
+datasets = {
+    "human36m": {
+        "path": "/datasets/human36m/skelda/pose_test.json",
+        "take_interval": 5,
+        "min_match_score": 0.95,
+        "min_group_size": 1,
+        "min_bbox_score": 0.4,
+        "min_bbox_area": 0.1 * 0.1,
+        "batch_poses": False,
+    },
+    "panoptic": {
+        "path": "/datasets/panoptic/skelda/test.json",
+        "cams": ["00_03", "00_06", "00_12", "00_13", "00_23"],
+        # "cams": ["00_03", "00_06", "00_12"],
+        # "cams": ["00_03", "00_06", "00_12", "00_13", "00_23", "00_15", "00_10", "00_21", "00_09", "00_01"],
+        "take_interval": 3,
+        "min_match_score": 0.95,
+        "use_scenes": ["160906_pizza1", "160422_haggling1", "160906_ian5"],
+        "min_group_size": 1,
+        # "min_group_size": 4,
+        "min_bbox_area": 0.05 * 0.05,
+    },
+    "mvor": {
+        "path": "/datasets/mvor/skelda/all.json",
+        "take_interval": 1,
+        "with_depth": False,
+        "min_match_score": 0.85,
+        "min_bbox_score": 0.25,
+    },
+    "campus": {
+        "path": "/datasets/campus/skelda/test.json",
+        "take_interval": 1,
+        "min_match_score": 0.90,
+        "min_bbox_score": 0.5,
+    },
+    "shelf": {
+        "path": "/datasets/shelf/skelda/test.json",
+        "take_interval": 1,
+        "min_match_score": 0.96,
+        "min_group_size": 2,
+    },
+    "ikeaasm": {
+        "path": "/datasets/ikeaasm/skelda/test.json",
+        "take_interval": 2,
+        "min_match_score": 0.92,
+        "min_bbox_score": 0.20,
+    },
+    "chi3d": {
+        "path": "/datasets/chi3d/skelda/all.json",
+        "take_interval": 5,
+    },
+    "tsinghua": {
+        "path": "/datasets/tsinghua/skelda/test.json",
+        "take_interval": 3,
+        "min_match_score": 0.95,
+        "min_group_size": 2,
+    },
+    "human36m_wb": {
+        "path": "/datasets/human36m/skelda/wb/test.json",
+        "take_interval": 100,
+        "min_bbox_score": 0.4,
+        "batch_poses": False,
+    },
+    "egohumans_tagging": {
+        "path": "/datasets/egohumans/skelda/all.json",
+        "take_interval": 2,
+        "subset": "tagging",
+        "min_group_size": 2,
+        "min_bbox_score": 0.2,
+        "min_bbox_area": 0.05 * 0.05,
+    },
"path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "legoassemble", + "min_group_size": 2, + }, + "egohumans_fencing": { + "path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "fencing", + "min_group_size": 7, + "min_bbox_score": 0.5, + "min_bbox_area": 0.05 * 0.05, + }, + "egohumans_basketball": { + "path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "basketball", + "min_group_size": 7, + "min_bbox_score": 0.25, + "min_bbox_area": 0.025 * 0.025, + }, + "egohumans_volleyball": { + "path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "volleyball", + "min_group_size": 11, + "min_bbox_score": 0.25, + "min_bbox_area": 0.05 * 0.05, + }, + "egohumans_badminton": { + "path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "badminton", + "min_group_size": 7, + "min_bbox_score": 0.25, + "min_bbox_area": 0.05 * 0.05, + }, + "egohumans_tennis": { + "path": "/datasets/egohumans/skelda/all.json", + "take_interval": 2, + "subset": "tennis", + "min_group_size": 11, + "min_bbox_area": 0.025 * 0.025, + }, +} + +joint_names_2d = utils_pipeline.get_joint_names(whole_body) +joint_names_3d = list(joint_names_2d) +eval_joints = [ + "head", + "shoulder_left", + "shoulder_right", + "elbow_left", + "elbow_right", + "wrist_left", + "wrist_right", + "hip_left", + "hip_right", + "knee_left", + "knee_right", + "ankle_left", + "ankle_right", +] +if dataset_use == "human36m": + eval_joints[eval_joints.index("head")] = "nose" +if dataset_use == "panoptic": + eval_joints[eval_joints.index("head")] = "nose" +if dataset_use == "human36m_wb": + if utils_pipeline.use_whole_body(whole_body): + eval_joints = list(joint_names_2d) + else: + eval_joints[eval_joints.index("head")] = "nose" + +# output_dir = "/RapidPoseTriangulation/data/testoutput/" +output_dir = "" + +# ================================================================================================== + + +def load_json(path: str): + with open(path, "r", encoding="utf-8") as file: + data = json.load(file) + return data + + +def save_json(data: dict, path: str): + with open(path, "w+", encoding="utf-8") as file: + json.dump(data, file, indent=0) + + +# ================================================================================================== + + +def load_labels(dataset: dict): + """Load labels by dataset description""" + + if "panoptic" in dataset: + labels = load_json(dataset["panoptic"]["path"]) + labels = [lb for i, lb in enumerate(labels) if i % 1500 < 90] + + # Filter by maximum number of persons + labels = [l for l in labels if len(l["bodies3D"]) <= 10] + + # Filter scenes + if "use_scenes" in dataset["panoptic"]: + labels = [ + l for l in labels if l["scene"] in dataset["panoptic"]["use_scenes"] + ] + + # Filter cameras + if not "cameras_depth" in labels[0]: + for label in labels: + for i, cam in reversed(list(enumerate(label["cameras"]))): + if cam["name"] not in dataset["panoptic"]["cams"]: + label["cameras"].pop(i) + label["imgpaths"].pop(i) + + elif "human36m" in dataset: + labels = load_json(dataset["human36m"]["path"]) + labels = [lb for lb in labels if lb["subject"] == "S9"] + labels = [lb for i, lb in enumerate(labels) if i % 4000 < 150] + + for label in labels: + label.pop("action") + label.pop("frame") + + elif "mvor" in dataset: + labels = load_json(dataset["mvor"]["path"]) + + # Rename keys + for label in labels: + label["cameras_color"] = label["cameras"] + label["imgpaths_color"] = label["imgpaths"] 
+def load_labels(dataset: dict):
+    """Load labels by dataset description"""
+
+    if "panoptic" in dataset:
+        labels = load_json(dataset["panoptic"]["path"])
+        labels = [lb for i, lb in enumerate(labels) if i % 1500 < 90]
+
+        # Filter by maximum number of persons
+        labels = [l for l in labels if len(l["bodies3D"]) <= 10]
+
+        # Filter scenes
+        if "use_scenes" in dataset["panoptic"]:
+            labels = [
+                l for l in labels if l["scene"] in dataset["panoptic"]["use_scenes"]
+            ]
+
+        # Filter cameras
+        if "cameras_depth" not in labels[0]:
+            for label in labels:
+                for i, cam in reversed(list(enumerate(label["cameras"]))):
+                    if cam["name"] not in dataset["panoptic"]["cams"]:
+                        label["cameras"].pop(i)
+                        label["imgpaths"].pop(i)
+
+    elif "human36m" in dataset:
+        labels = load_json(dataset["human36m"]["path"])
+        labels = [lb for lb in labels if lb["subject"] == "S9"]
+        labels = [lb for i, lb in enumerate(labels) if i % 4000 < 150]
+
+        for label in labels:
+            label.pop("action")
+            label.pop("frame")
+
+    elif "mvor" in dataset:
+        labels = load_json(dataset["mvor"]["path"])
+
+        # Mirror the keys under their color-specific names
+        for label in labels:
+            label["cameras_color"] = label["cameras"]
+            label["imgpaths_color"] = label["imgpaths"]
+
+    elif "ikeaasm" in dataset:
+        labels = load_json(dataset["ikeaasm"]["path"])
+        cams0 = str(labels[0]["cameras"])
+        labels = [lb for lb in labels if str(lb["cameras"]) == cams0]
+
+    elif "shelf" in dataset:
+        labels = load_json(dataset["shelf"]["path"])
+        labels = [lb for lb in labels if "test" in lb["splits"]]
+
+    elif "campus" in dataset:
+        labels = load_json(dataset["campus"]["path"])
+        labels = [lb for lb in labels if "test" in lb["splits"]]
+
+    elif "tsinghua" in dataset:
+        labels = load_json(dataset["tsinghua"]["path"])
+        labels = [lb for lb in labels if "test" in lb["splits"]]
+        labels = [lb for lb in labels if lb["seq"] == "seq_1"]
+        labels = [lb for i, lb in enumerate(labels) if i % 300 < 90]
+
+        for label in labels:
+            label["bodyids"] = list(range(len(label["bodies3D"])))
+
+    elif "chi3d" in dataset:
+        labels = load_json(dataset["chi3d"]["path"])
+        labels = [lb for lb in labels if lb["setup"] == "s03"]
+        labels = [lb for i, lb in enumerate(labels) if i % 2000 < 150]
+
+    elif "human36m_wb" in dataset:
+        labels = load_json(dataset["human36m_wb"]["path"])
+
+    elif any(("egohumans" in key for key in dataset)):
+        labels = load_json(dataset[dataset_use]["path"])
+        labels = [lb for lb in labels if "test" in lb["splits"]]
+        labels = [lb for lb in labels if dataset[dataset_use]["subset"] in lb["seq"]]
+        if dataset[dataset_use]["subset"] in ["volleyball", "tennis"]:
+            labels = [lb for i, lb in enumerate(labels) if i % 150 < 60]
+
+    else:
+        raise ValueError("Dataset not available")
+
+    # Optionally drop samples to speed up train/eval
+    if "take_interval" in dataset:
+        take_interval = dataset["take_interval"]
+        if take_interval > 1:
+            labels = [l for i, l in enumerate(labels) if i % take_interval == 0]
+
+    # Add default values
+    for label in labels:
+        if "scene" not in label:
+            label["scene"] = "default"
+        for cam in label["cameras"]:
+            if "type" not in cam:
+                cam["type"] = "pinhole"
+
+    return labels
+
+
+# ==================================================================================================
+
+
+def main():
+    global joint_names_3d, eval_joints
+
+    print("Loading dataset ...")
+    labels = load_labels(
+        {
+            dataset_use: datasets[dataset_use],
+            "take_interval": datasets[dataset_use]["take_interval"],
+        }
+    )
+
+    # Print a dataset sample for debugging
+    print(labels[0])
+
+    # Save dataset
+    tmp_export_dir = "/tmp/rpt/"
+    for label in labels:
+        if "splits" in label:
+            label.pop("splits")
+    json_writer.save_dataset(labels, tmp_export_dir)
+
+    # Load dataset-specific parameters, falling back to the defaults above
+    min_match_score = datasets[dataset_use].get(
+        "min_match_score", default_min_match_score
+    )
+    min_group_size = datasets[dataset_use].get("min_group_size", default_min_group_size)
+    min_bbox_score = datasets[dataset_use].get("min_bbox_score", default_min_bbox_score)
+    min_bbox_area = datasets[dataset_use].get("min_bbox_area", default_min_bbox_area)
+    batch_poses = datasets[dataset_use].get("batch_poses", default_batch_poses)
+
+    # Save config
+    config_path = tmp_export_dir + "config.json"
+    config = {
+        "min_match_score": min_match_score,
+        "min_group_size": min_group_size,
+        "min_bbox_score": min_bbox_score,
+        "min_bbox_area": min_bbox_area,
+        "batch_poses": batch_poses,
+        "whole_body": whole_body,
+        "take_interval": datasets[dataset_use]["take_interval"],
+    }
+    save_json(config, config_path)
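+
+    # With dataset_use = "human36m", the exported config.json then contains:
+    # {"min_match_score": 0.95, "min_group_size": 1, "min_bbox_score": 0.4,
+    #  "min_bbox_area": 0.01, "batch_poses": false,
+    #  "whole_body": {"foots": false, "face": false, "hands": false},
+    #  "take_interval": 5}
+    #
+    # Note that os.system() below ignores the binary's exit status; a stricter
+    # variant (sketch) would be:
+    #   subprocess.run(["/RapidPoseTriangulation/scripts/test_skelda_dataset"], check=True)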
+
+    # Call the CPP binary
+    os.system("/RapidPoseTriangulation/scripts/test_skelda_dataset")
+
+    # Load the results
+    print("Loading exports ...")
+    res_path = tmp_export_dir + "results.json"
+    results = load_json(res_path)
+    all_poses_3d = results["all_poses_3d"]
+    all_ids = results["all_ids"]
+    joint_names_3d = results["joint_names_3d"]
+
+    # Run evaluation
+    _ = evals.mpjpe.run_eval(
+        labels,
+        all_poses_3d,
+        all_ids,
+        joint_names_net=joint_names_3d,
+        joint_names_use=eval_joints,
+        save_error_imgs=output_dir,
+    )
+    _ = evals.pcp.run_eval(
+        labels,
+        all_poses_3d,
+        all_ids,
+        joint_names_net=joint_names_3d,
+        joint_names_use=eval_joints,
+        replace_head_with_nose=True,
+    )
+
+
+# ==================================================================================================
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/test_skelda_dataset.py b/scripts/test_skelda_dataset_py.py
similarity index 100%
rename from scripts/test_skelda_dataset.py
rename to scripts/test_skelda_dataset_py.py