feat(rgbd): add RGB-D reconstruction pipeline

Add end-to-end RGB-D reconstruction support across the C++ core and Python API.

- add a native merge_rgbd_views path, view-aware 3D pose containers, and nanobind bindings

- expose Python helpers to sample aligned depth, apply per-joint offsets, lift UVD poses to world space, and run reconstruct_rgbd

- add RGB-D regression tests for merging, manual pipeline parity, symmetric depth sampling windows, and out-of-bounds joints

- bump the project version from 0.1.0 to 0.2.0 for the new feature surface
This commit is contained in:
2026-03-26 13:04:57 +08:00
parent 6c09f7044b
commit ed721729fd
12 changed files with 1932 additions and 5 deletions
+1 -1
View File
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project(RapidPoseTriangulation project(RapidPoseTriangulation
VERSION 0.1.0 VERSION 0.2.0
LANGUAGES CXX LANGUAGES CXX
DESCRIPTION "Rapid Pose Triangulation library with Python bindings" DESCRIPTION "Rapid Pose Triangulation library with Python bindings"
) )
+46
View File
@@ -23,6 +23,8 @@ using CountArray = nb::ndarray<nb::numpy, const uint32_t, nb::shape<-1>, nb::c_c
using TrackIdArray = nb::ndarray<nb::numpy, const int64_t, nb::shape<-1>, nb::c_contig>; using TrackIdArray = nb::ndarray<nb::numpy, const int64_t, nb::shape<-1>, nb::c_contig>;
using PoseArray3DConst = using PoseArray3DConst =
nb::ndarray<nb::numpy, const float, nb::shape<-1, -1, 4>, nb::c_contig>; nb::ndarray<nb::numpy, const float, nb::shape<-1, -1, 4>, nb::c_contig>;
using PoseArray3DByViewConst =
nb::ndarray<nb::numpy, const float, nb::shape<-1, -1, -1, 4>, nb::c_contig>;
using PoseArray3D = nb::ndarray<nb::numpy, float, nb::shape<-1, -1, 4>, nb::c_contig>; using PoseArray3D = nb::ndarray<nb::numpy, float, nb::shape<-1, -1, 4>, nb::c_contig>;
using PoseArray2DOut = nb::ndarray<nb::numpy, float, nb::shape<-1, 4>, nb::c_contig>; using PoseArray2DOut = nb::ndarray<nb::numpy, float, nb::shape<-1, 4>, nb::c_contig>;
@@ -59,6 +61,32 @@ PoseBatch3DView pose_batch3d_view_from_numpy(const PoseArray3DConst &poses_3d)
}; };
} }
PoseBatch3DByViewView pose_batch3d_by_view_from_numpy(
    const PoseArray3DByViewConst &poses_3d,
    const CountArray &person_counts)
{
    // Validate that both arrays describe the same set of views before wrapping them.
    const size_t view_count = static_cast<size_t>(poses_3d.shape(0));
    if (static_cast<size_t>(person_counts.shape(0)) != view_count)
    {
        throw std::invalid_argument("poses_3d and person_counts must have the same number of views.");
    }
    // Each per-view count must fit inside the padded person dimension.
    for (size_t view = 0; view < view_count; ++view)
    {
        if (person_counts(view) > poses_3d.shape(1))
        {
            throw std::invalid_argument("person_counts entries must not exceed the padded person dimension.");
        }
    }
    // Non-owning view over the numpy buffers; the caller must keep both arrays alive.
    PoseBatch3DByViewView result {};
    result.data = poses_3d.data();
    result.person_counts = person_counts.data();
    result.num_views = view_count;
    result.max_persons = static_cast<size_t>(poses_3d.shape(1));
    result.num_joints = static_cast<size_t>(poses_3d.shape(2));
    return result;
}
TrackedPoseBatch3DView tracked_pose_batch_view_from_numpy( TrackedPoseBatch3DView tracked_pose_batch_view_from_numpy(
const PoseArray3DConst &poses_3d, const PoseArray3DConst &poses_3d,
const TrackIdArray &track_ids) const TrackIdArray &track_ids)
@@ -432,6 +460,24 @@ NB_MODULE(_core, m)
"person_counts"_a, "person_counts"_a,
"config"_a); "config"_a);
// Python binding: merge per-view world-space RGBD pose proposals into the final
// set of 3D poses. Converts the numpy inputs into a non-owning batch view and
// copies the merged PoseBatch3D back out as a numpy array.
m.def(
    "merge_rgbd_views",
    [](const PoseArray3DByViewConst &poses_3d,
       const CountArray &person_counts,
       const TriangulationConfig &config,
       float max_distance)
    {
        const PoseBatch3D merged = merge_rgbd_views(
            pose_batch3d_by_view_from_numpy(poses_3d, person_counts),
            config,
            max_distance);
        return pose_batch_to_numpy(merged);
    },
    "poses_3d"_a,
    "person_counts"_a,
    "config"_a,
    // Default merge radius in world units (presumably meters, matching the
    // depth-lift path — confirm against rgbd_merger); mirrors the C++ default.
    "max_distance"_a = 0.5f);
m.def( m.def(
"triangulate_with_report", "triangulate_with_report",
[](const PoseArray2D &poses_2d, [](const PoseArray2D &poses_2d,
+1 -1
View File
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "rapid-pose-triangulation" name = "rapid-pose-triangulation"
version = "0.1.0" version = "0.2.0"
description = "Rapid Pose Triangulation library with nanobind Python bindings" description = "Rapid Pose Triangulation library with nanobind Python bindings"
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
+1
View File
@@ -3,6 +3,7 @@
set(RPT_SOURCES set(RPT_SOURCES
camera.cpp camera.cpp
interface.cpp interface.cpp
rgbd_merger.cpp
triangulator.cpp triangulator.cpp
) )
+37
View File
@@ -23,6 +23,17 @@ size_t pose3d_offset(size_t person, size_t joint, size_t coord, size_t num_joint
{ {
return (((person * num_joints) + joint) * 4) + coord; return (((person * num_joints) + joint) * 4) + coord;
} }
// Flat index into a row-major [view][person][joint][coord] buffer with
// 4 floats per joint ([x, y, z, score]).
size_t pose3d_by_view_offset(
    size_t view,
    size_t person,
    size_t joint,
    size_t coord,
    size_t max_persons,
    size_t num_joints)
{
    const size_t person_index = view * max_persons + person;
    const size_t joint_index = person_index * num_joints + joint;
    return joint_index * 4 + coord;
}
} // namespace } // namespace
// ================================================================================================= // =================================================================================================
@@ -53,6 +64,11 @@ const float &TrackedPoseBatch3DView::at(size_t person, size_t joint, size_t coor
return data[pose3d_offset(person, joint, coord, num_joints)]; return data[pose3d_offset(person, joint, coord, num_joints)];
} }
// Read-only access to one coordinate; index math lives in the shared helper.
const float &PoseBatch3DByViewView::at(size_t view, size_t person, size_t joint, size_t coord) const
{
    const size_t offset = pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints);
    return data[offset];
}
const float &PoseBatch2D::at(size_t view, size_t person, size_t joint, size_t coord) const const float &PoseBatch2D::at(size_t view, size_t person, size_t joint, size_t coord) const
{ {
return data[pose2d_offset(view, person, joint, coord, max_persons, num_joints)]; return data[pose2d_offset(view, person, joint, coord, max_persons, num_joints)];
@@ -129,6 +145,27 @@ PoseBatch3DView PoseBatch3D::view() const
return PoseBatch3DView {data.data(), num_persons, num_joints}; return PoseBatch3DView {data.data(), num_persons, num_joints};
} }
// Mutable access into the owning flat [view][person][joint][coord] buffer.
float &PoseBatch3DByView::at(size_t view, size_t person, size_t joint, size_t coord)
{
    const size_t offset = pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints);
    return data[offset];
}
// Read-only counterpart of the mutable accessor above.
const float &PoseBatch3DByView::at(size_t view, size_t person, size_t joint, size_t coord) const
{
    const size_t offset = pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints);
    return data[offset];
}
// Build a non-owning view over this batch; valid only while the batch is alive.
PoseBatch3DByViewView PoseBatch3DByView::view() const
{
    PoseBatch3DByViewView result {};
    result.data = data.data();
    result.person_counts = person_counts.data();
    result.num_views = num_views;
    result.max_persons = max_persons;
    result.num_joints = num_joints;
    return result;
}
NestedPoses3D PoseBatch3D::to_nested() const NestedPoses3D PoseBatch3D::to_nested() const
{ {
NestedPoses3D poses_3d(num_persons); NestedPoses3D poses_3d(num_persons);
+37
View File
@@ -45,6 +45,17 @@ struct TrackedPoseBatch3DView
const float &at(size_t person, size_t joint, size_t coord) const; const float &at(size_t person, size_t joint, size_t coord) const;
}; };
// Non-owning view over per-view 3D pose proposals, laid out as a flat
// [num_views][max_persons][num_joints][4] float buffer ([x, y, z, score]).
// The pointed-to buffers must outlive the view.
struct PoseBatch3DByViewView
{
    const float *data = nullptr;             // flat pose buffer, not owned
    const uint32_t *person_counts = nullptr; // one valid-person count per view, not owned
    size_t num_views = 0;
    size_t max_persons = 0;  // padded person dimension (counts may be smaller)
    size_t num_joints = 0;
    // Read-only access to one coordinate of one joint.
    const float &at(size_t view, size_t person, size_t joint, size_t coord) const;
};
struct PoseBatch2D struct PoseBatch2D
{ {
std::vector<float> data; std::vector<float> data;
@@ -74,6 +85,19 @@ struct PoseBatch3D
static PoseBatch3D from_nested(const NestedPoses3D &poses_3d); static PoseBatch3D from_nested(const NestedPoses3D &poses_3d);
}; };
// Owning container for per-view 3D pose proposals: a flat
// [num_views][max_persons][num_joints][4] float buffer plus one valid-person
// count per view. Mirrors PoseBatch3DByViewView, which borrows from it.
struct PoseBatch3DByView
{
    std::vector<float> data;              // [view][person][joint][4] row-major
    std::vector<uint32_t> person_counts;  // valid persons per view
    size_t num_views = 0;
    size_t max_persons = 0;
    size_t num_joints = 0;
    float &at(size_t view, size_t person, size_t joint, size_t coord);
    const float &at(size_t view, size_t person, size_t joint, size_t coord) const;
    // Non-owning view over this batch; valid only while the batch is alive.
    PoseBatch3DByViewView view() const;
};
// ================================================================================================= // =================================================================================================
struct PairCandidate struct PairCandidate
@@ -242,6 +266,11 @@ PoseBatch3D triangulate_poses(
const TriangulationConfig &config, const TriangulationConfig &config,
const TriangulationOptions *options_override = nullptr); const TriangulationOptions *options_override = nullptr);
// Merge per-view world-space RGBD pose proposals into a single PoseBatch3D.
// `max_distance` is the merge radius in world units (presumably meters —
// confirm against rgbd_merger.cpp, not visible here).
PoseBatch3D merge_rgbd_views(
    const PoseBatch3DByViewView &poses_3d,
    const TriangulationConfig &config,
    float max_distance = 0.5f);
TriangulationResult triangulate_with_report( TriangulationResult triangulate_with_report(
const PoseBatch2DView &poses_2d, const PoseBatch2DView &poses_2d,
const TriangulationConfig &config, const TriangulationConfig &config,
@@ -256,6 +285,14 @@ inline PoseBatch3D triangulate_poses(
return triangulate_poses(poses_2d.view(), config, options_override); return triangulate_poses(poses_2d.view(), config, options_override);
} }
// Convenience overload: accepts an owning batch and forwards its view.
inline PoseBatch3D merge_rgbd_views(
    const PoseBatch3DByView &poses_3d,
    const TriangulationConfig &config,
    float max_distance = 0.5f)
{
    const PoseBatch3DByViewView batch_view = poses_3d.view();
    return merge_rgbd_views(batch_view, config, max_distance);
}
inline TriangulationTrace triangulate_debug( inline TriangulationTrace triangulate_debug(
const PoseBatch2D &poses_2d, const PoseBatch2D &poses_2d,
const TriangulationConfig &config, const TriangulationConfig &config,
+1327
View File
File diff suppressed because it is too large Load Diff
+74 -1
View File
@@ -24,6 +24,7 @@ from ._core import (
build_pair_candidates as _build_pair_candidates, build_pair_candidates as _build_pair_candidates,
filter_pairs_with_previous_poses as _filter_pairs_with_previous_poses, filter_pairs_with_previous_poses as _filter_pairs_with_previous_poses,
make_camera as _make_camera, make_camera as _make_camera,
merge_rgbd_views as _merge_rgbd_views,
triangulate_debug as _triangulate_debug, triangulate_debug as _triangulate_debug,
triangulate_poses as _triangulate_poses, triangulate_poses as _triangulate_poses,
triangulate_with_report as _triangulate_with_report, triangulate_with_report as _triangulate_with_report,
@@ -33,10 +34,11 @@ if TYPE_CHECKING:
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
from ._helpers import CameraLike, CameraModelLike, Matrix3x3Like, PoseViewLike, VectorLike from ._helpers import CameraLike, CameraModelLike, DepthImageLike, Matrix3x3Like, PoseViewLike, VectorLike
PoseArray2D = npt.NDArray[np.float32] PoseArray2D = npt.NDArray[np.float32]
PoseArray3D = npt.NDArray[np.float32] PoseArray3D = npt.NDArray[np.float32]
PoseArray3DByView = npt.NDArray[np.float32]
PersonCountArray = npt.NDArray[np.uint32] PersonCountArray = npt.NDArray[np.uint32]
TrackIdArray = npt.NDArray[np.int64] TrackIdArray = npt.NDArray[np.int64]
@@ -103,6 +105,42 @@ def pack_poses_2d(
return _pack_poses_2d(views, joint_count=joint_count) return _pack_poses_2d(views, joint_count=joint_count)
def sample_depth_for_poses(
    poses_2d: "PoseArray2D",
    person_counts: "PersonCountArray",
    depth_images: "Sequence[DepthImageLike]",
    *,
    window_size: int = 7,
) -> "PoseArray3D":
    """Sample aligned depth for visible 2D joints and return `[u, v, d, score]` rows."""
    # Imported lazily so the numpy-heavy helper module loads only when RGBD is used.
    from ._helpers import sample_depth_for_poses as _impl

    return _impl(
        poses_2d,
        person_counts,
        depth_images,
        window_size=window_size,
    )
def apply_depth_offsets(
    poses_uvd: "PoseArray3D",
    joint_names: "Sequence[str]",
) -> "PoseArray3D":
    """Apply the SimpleDepthPose per-joint depth offsets to `[u, v, d, score]` rows."""
    # Lazy import keeps the helper module off the package's critical import path.
    from ._helpers import apply_depth_offsets as _impl

    return _impl(poses_uvd, joint_names)
def lift_depth_poses_to_world(
    poses_uvd: "PoseArray3D",
    cameras: "Sequence[CameraLike]",
) -> "PoseArray3D":
    """Lift `[u, v, d, score]` joints into world-space `[x, y, z, score]` poses."""
    # Lazy import mirrors the other RGBD helpers in this module.
    from ._helpers import lift_depth_poses_to_world as _impl

    return _impl(poses_uvd, cameras)
def make_triangulation_config( def make_triangulation_config(
cameras: "Sequence[CameraLike]", cameras: "Sequence[CameraLike]",
roomparams: "npt.NDArray[np.generic] | Sequence[Sequence[float]]", roomparams: "npt.NDArray[np.generic] | Sequence[Sequence[float]]",
@@ -172,6 +210,36 @@ def triangulate_poses(
return _triangulate_poses(poses_2d, person_counts, config) return _triangulate_poses(poses_2d, person_counts, config)
def merge_rgbd_views(
    poses_3d: "PoseArray3DByView",
    person_counts: "PersonCountArray",
    config: TriangulationConfig,
    *,
    max_distance: float = 0.5,
) -> "PoseArray3D":
    """Merge per-view world-space RGBD pose proposals into final 3D poses."""
    # The native binding takes max_distance positionally and as a plain float.
    distance = float(max_distance)
    return _merge_rgbd_views(poses_3d, person_counts, config, distance)
def reconstruct_rgbd(
    poses_2d: "PoseArray2D",
    person_counts: "PersonCountArray",
    depth_images: "Sequence[DepthImageLike]",
    config: TriangulationConfig,
    *,
    use_depth_offsets: bool = True,
    window_size: int = 7,
    max_distance: float = 0.5,
) -> "PoseArray3D":
    """Reconstruct per-frame RGBD poses from calibrated detections and aligned depth images."""
    # 1) Sample aligned depth at each visible joint -> [u, v, d, score] rows.
    uvd = sample_depth_for_poses(
        poses_2d, person_counts, depth_images, window_size=window_size
    )
    # 2) Optionally apply the per-joint depth offsets keyed by joint name.
    if use_depth_offsets:
        uvd = apply_depth_offsets(uvd, config.joint_names)
    # 3) Lift to world space, then 4) merge the per-view proposals.
    world = lift_depth_poses_to_world(uvd, config.cameras)
    return merge_rgbd_views(world, person_counts, config, max_distance=max_distance)
def triangulate_with_report( def triangulate_with_report(
poses_2d: "PoseArray2D", poses_2d: "PoseArray2D",
person_counts: "PersonCountArray", person_counts: "PersonCountArray",
@@ -200,6 +268,7 @@ __all__ = [
"CameraModel", "CameraModel",
"AssociationReport", "AssociationReport",
"AssociationStatus", "AssociationStatus",
"apply_depth_offsets",
"FinalPoseAssociationDebug", "FinalPoseAssociationDebug",
"TriangulationConfig", "TriangulationConfig",
"TriangulationOptions", "TriangulationOptions",
@@ -216,9 +285,13 @@ __all__ = [
"build_pair_candidates", "build_pair_candidates",
"convert_cameras", "convert_cameras",
"filter_pairs_with_previous_poses", "filter_pairs_with_previous_poses",
"lift_depth_poses_to_world",
"make_camera", "make_camera",
"make_triangulation_config", "make_triangulation_config",
"merge_rgbd_views",
"pack_poses_2d", "pack_poses_2d",
"reconstruct_rgbd",
"sample_depth_for_poses",
"triangulate_debug", "triangulate_debug",
"triangulate_poses", "triangulate_poses",
"triangulate_with_report", "triangulate_with_report",
+44 -1
View File
@@ -23,10 +23,11 @@ from ._core import (
TriangulationResult, TriangulationResult,
TriangulationTrace, TriangulationTrace,
) )
from ._helpers import CameraLike, CameraModelLike, Matrix3x3Like, PoseViewLike, RoomParamsLike, VectorLike from ._helpers import CameraLike, CameraModelLike, DepthImageLike, Matrix3x3Like, PoseViewLike, RoomParamsLike, VectorLike
PoseArray2D: TypeAlias = npt.NDArray[np.float32] PoseArray2D: TypeAlias = npt.NDArray[np.float32]
PoseArray3D: TypeAlias = npt.NDArray[np.float32] PoseArray3D: TypeAlias = npt.NDArray[np.float32]
PoseArray3DByView: TypeAlias = npt.NDArray[np.float32]
PersonCountArray: TypeAlias = npt.NDArray[np.uint32] PersonCountArray: TypeAlias = npt.NDArray[np.uint32]
TrackIdArray: TypeAlias = npt.NDArray[np.int64] TrackIdArray: TypeAlias = npt.NDArray[np.int64]
@@ -59,6 +60,27 @@ def pack_poses_2d(
) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.uint32]]: ... ) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.uint32]]: ...
# Samples aligned depth per visible joint; returns [u, v, d, score] rows.
def sample_depth_for_poses(
    poses_2d: PoseArray2D,
    person_counts: PersonCountArray,
    depth_images: Sequence[DepthImageLike],
    *,
    window_size: int = 7,
) -> PoseArray3D: ...
# Applies per-joint depth offsets (keyed by joint name) to [u, v, d, score] rows.
def apply_depth_offsets(
    poses_uvd: PoseArray3D,
    joint_names: Sequence[str],
) -> PoseArray3D: ...
# Lifts [u, v, d, score] joints into world-space [x, y, z, score] poses per view.
def lift_depth_poses_to_world(
    poses_uvd: PoseArray3D,
    cameras: Sequence[CameraLike],
) -> PoseArray3DByView: ...
def make_triangulation_config( def make_triangulation_config(
cameras: Sequence[CameraLike], cameras: Sequence[CameraLike],
roomparams: RoomParamsLike, roomparams: RoomParamsLike,
@@ -103,6 +125,27 @@ def triangulate_poses(
) -> PoseArray3D: ... ) -> PoseArray3D: ...
# Merges per-view world-space RGBD pose proposals into final 3D poses.
def merge_rgbd_views(
    poses_3d: PoseArray3DByView,
    person_counts: PersonCountArray,
    config: TriangulationConfig,
    *,
    max_distance: float = 0.5,
) -> PoseArray3D: ...
# End-to-end RGBD reconstruction: sample depth, offset, lift, then merge views.
def reconstruct_rgbd(
    poses_2d: PoseArray2D,
    person_counts: PersonCountArray,
    depth_images: Sequence[DepthImageLike],
    config: TriangulationConfig,
    *,
    use_depth_offsets: bool = True,
    window_size: int = 7,
    max_distance: float = 0.5,
) -> PoseArray3D: ...
def triangulate_with_report( def triangulate_with_report(
poses_2d: PoseArray2D, poses_2d: PoseArray2D,
person_counts: PersonCountArray, person_counts: PersonCountArray,
+166
View File
@@ -12,6 +12,7 @@ Matrix3x3Like: TypeAlias = Sequence[Sequence[float]]
VectorLike: TypeAlias = Sequence[float] VectorLike: TypeAlias = Sequence[float]
RoomParamsLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[float]] RoomParamsLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[float]]
PoseViewLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[Sequence[float]]] | Sequence[Sequence[float]] PoseViewLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[Sequence[float]]] | Sequence[Sequence[float]]
DepthImageLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[float]]
class CameraDict(TypedDict, total=False): class CameraDict(TypedDict, total=False):
@@ -29,6 +30,29 @@ class CameraDict(TypedDict, total=False):
CameraModelLike: TypeAlias = CameraModel | Literal["pinhole", "fisheye"] CameraModelLike: TypeAlias = CameraModel | Literal["pinhole", "fisheye"]
CameraLike = Camera | CameraDict CameraLike = Camera | CameraDict
# Per-joint depth offsets in meters, added to sampled depth by
# apply_depth_offsets (which converts them to millimeters). Joints missing
# from this table get an offset of 0.0.
# NOTE(review): values described elsewhere in this package as the
# "SimpleDepthPose" offsets — confirm against that upstream source.
DEFAULT_DEPTH_OFFSETS_METERS: dict[str, float] = {
    "nose": 0.005,
    "eye_left": 0.005,
    "eye_right": 0.005,
    "ear_left": 0.005,
    "ear_right": 0.005,
    "shoulder_left": 0.03,
    "shoulder_right": 0.03,
    "elbow_left": 0.02,
    "elbow_right": 0.02,
    "wrist_left": 0.01,
    "wrist_right": 0.01,
    "hip_left": 0.04,
    "hip_right": 0.04,
    "knee_left": 0.03,
    "knee_right": 0.03,
    "ankle_left": 0.03,
    "ankle_right": 0.03,
    "hip_middle": 0.04,
    "shoulder_middle": 0.03,
    "head": 0.0,
}
def _coerce_camera_model(model: CameraModelLike) -> CameraModel: def _coerce_camera_model(model: CameraModelLike) -> CameraModel:
if isinstance(model, CameraModel): if isinstance(model, CameraModel):
@@ -55,6 +79,15 @@ def _coerce_distortion(distortion: VectorLike, camera_model: CameraModel) -> tup
return values return values
def _coerce_depth_image(depth_image: DepthImageLike) -> npt.NDArray[np.float32]:
array = np.asarray(depth_image, dtype=np.float32)
if array.ndim == 3 and array.shape[-1] == 1:
array = np.squeeze(array, axis=-1)
if array.ndim != 2:
raise ValueError("Each depth image must have shape [height, width] or [height, width, 1].")
return np.ascontiguousarray(array, dtype=np.float32)
def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]: def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]:
"""Normalize mappings or existing Camera objects into bound Camera instances.""" """Normalize mappings or existing Camera objects into bound Camera instances."""
@@ -157,3 +190,136 @@ def make_triangulation_config(
options.min_group_size = int(min_group_size) options.min_group_size = int(min_group_size)
config.options = options config.options = options
return config return config
def sample_depth_for_poses(
    poses_2d: npt.NDArray[np.generic],
    person_counts: npt.NDArray[np.generic],
    depth_images: Sequence[DepthImageLike],
    *,
    window_size: int = 7,
) -> npt.NDArray[np.float32]:
    """Sample aligned depth for each visible 2D joint and return `[u, v, d, score]` rows.

    The sampling window is cross-shaped (a vertical and a horizontal line of
    `window_size` pixels through the joint), and the depth is the nanmedian of
    the positive samples. Depth units are passed through unchanged from the
    input images (millimeters downstream — see lift_depth_poses_to_world).
    Joints that are out of bounds, have no positive depth sample, or carry a
    non-positive score are zeroed out entirely.
    """
    poses = np.asarray(poses_2d, dtype=np.float32)
    counts = np.asarray(person_counts, dtype=np.uint32)
    if poses.ndim != 4 or poses.shape[-1] != 3:
        raise ValueError("poses_2d must have shape [views, max_persons, joints, 3].")
    if counts.ndim != 1 or counts.shape[0] != poses.shape[0]:
        raise ValueError("person_counts must be a 1D array aligned with the pose views.")
    if len(depth_images) != poses.shape[0]:
        raise ValueError("depth_images must have the same number of views as poses_2d.")
    if window_size <= 0:
        raise ValueError("window_size must be positive.")
    radius = window_size // 2
    poses_uvd = np.zeros((poses.shape[0], poses.shape[1], poses.shape[2], 4), dtype=np.float32)
    for view_idx, depth_image in enumerate(depth_images):
        depth = _coerce_depth_image(depth_image)
        # Copy u/v through and move the detection score to the last channel.
        poses_uvd[view_idx, :, :, :2] = poses[view_idx, :, :, :2]
        poses_uvd[view_idx, :, :, 3] = poses[view_idx, :, :, 2]
        valid_persons = int(counts[view_idx])
        if valid_persons == 0:
            continue
        # Flatten valid persons x joints into rows of integer pixel coords.
        joints = poses[view_idx, :valid_persons, :, :2].astype(np.int32, copy=False).reshape(-1, 2)
        scores = poses[view_idx, :valid_persons, :, 2:3].reshape(-1, 1)
        # Zero-pad by `radius` so border windows stay in bounds (0 == no depth).
        depth_padded = np.pad(depth, radius, mode="constant", constant_values=0)
        offsets = np.arange(-radius, radius + 1, dtype=np.int32)
        # Joints outside the original image are marked invalid (not clamped in).
        valid_xy = (
            (joints[:, 0] >= 0)
            & (joints[:, 0] < depth.shape[1])
            & (joints[:, 1] >= 0)
            & (joints[:, 1] < depth.shape[0])
        )
        clamped_x = np.clip(joints[:, 0], 0, depth.shape[1] - 1)
        clamped_y = np.clip(joints[:, 1], 0, depth.shape[0] - 1)
        # `+ radius` shifts original-image coordinates into the padded image.
        center_x = clamped_x[:, None] + radius
        center_y = clamped_y[:, None] + radius
        vertical_grid = np.clip(np.add.outer(clamped_y, offsets) + radius, 0, depth_padded.shape[0] - 1)
        horizontal_grid = np.clip(
            np.add.outer(clamped_x, offsets) + radius, 0, depth_padded.shape[1] - 1
        )
        # Sample the vertical and horizontal arms of the cross per joint.
        vertical_depths = depth_padded[vertical_grid, center_x]
        horizontal_depths = depth_padded[center_y, horizontal_grid]
        all_depths = np.concatenate((vertical_depths, horizontal_depths), axis=1).astype(np.float32)
        # Out-of-bounds joints and non-positive depths contribute nothing.
        all_depths[~valid_xy] = np.nan
        all_depths[all_depths <= 0] = np.nan
        valid_depth_rows = ~np.isnan(all_depths).all(axis=1)
        sampled_depths = np.zeros((all_depths.shape[0],), dtype=np.float32)
        if np.any(valid_depth_rows):
            # errstate silences the all-NaN warnings nanmedian can emit.
            with np.errstate(all="ignore"):
                sampled_depths[valid_depth_rows] = np.nanmedian(all_depths[valid_depth_rows], axis=1)
        # Keep a joint only if it has both positive depth and positive score.
        valid_mask = ((sampled_depths > 0.0).astype(np.float32)[:, None] * (scores > 0.0).astype(np.float32))
        sampled_depths = sampled_depths.reshape(valid_persons, poses.shape[2], 1)
        valid_mask = valid_mask.reshape(valid_persons, poses.shape[2], 1)
        poses_uvd[view_idx, :valid_persons, :, 2:3] = sampled_depths
        # Zero all four channels of invalid joints, not just the depth.
        poses_uvd[view_idx, :valid_persons] *= np.concatenate((valid_mask, valid_mask, valid_mask, valid_mask), axis=-1)
    return poses_uvd
def apply_depth_offsets(
    poses_uvd: npt.NDArray[np.generic],
    joint_names: Sequence[str],
) -> npt.NDArray[np.float32]:
    """Apply the SimpleDepthPose per-joint depth offsets in meters.

    Offsets are looked up by joint name (unknown names get 0.0), converted to
    the depth channel's millimeter scale, and added only where depth > 0.
    Returns a new array; the input is not modified.
    """
    poses = np.asarray(poses_uvd, dtype=np.float32)
    if poses.ndim != 4 or poses.shape[-1] != 4:
        raise ValueError("poses_uvd must have shape [views, max_persons, joints, 4].")
    if len(joint_names) != poses.shape[2]:
        raise ValueError("joint_names must have the same number of joints as poses_uvd.")
    offsets_m = np.asarray(
        [DEFAULT_DEPTH_OFFSETS_METERS.get(str(name), 0.0) for name in joint_names],
        dtype=np.float32,
    )
    adjusted = poses.copy()
    # Only joints with a positive sampled depth are shifted.
    depth_mask = (adjusted[:, :, :, 2:3] > 0.0).astype(np.float32)
    adjusted[:, :, :, 2:3] += depth_mask * offsets_m[np.newaxis, np.newaxis, :, np.newaxis] * 1000.0
    return adjusted
def lift_depth_poses_to_world(
    poses_uvd: npt.NDArray[np.generic],
    cameras: Sequence[CameraLike],
) -> npt.NDArray[np.float32]:
    """Lift `[u, v, d, score]` joints into world-space `[x, y, z, score]` poses.

    Depth `d` is interpreted as millimeters and converted to meters. Joints
    with a non-positive score are zeroed out entirely in the result.
    """
    poses = np.asarray(poses_uvd, dtype=np.float32)
    if poses.ndim != 4 or poses.shape[-1] != 4:
        raise ValueError("poses_uvd must have shape [views, max_persons, joints, 4].")
    converted_cameras = convert_cameras(cameras)
    if len(converted_cameras) != poses.shape[0]:
        raise ValueError("cameras must have the same number of views as poses_uvd.")
    result = np.zeros_like(poses, dtype=np.float32)
    for view_idx, camera in enumerate(converted_cameras):
        # Flatten persons x joints into rows for a single batched matmul.
        uv = poses[view_idx, :, :, :2].reshape(-1, 2)
        depth_mm = poses[view_idx, :, :, 2:3].reshape(-1, 1)
        scores = poses[view_idx, :, :, 3:4].reshape(-1, 1)
        depth_m = depth_mm * 0.001  # millimeters -> meters
        # Back-project each pixel: camera ray = K^-1 @ [u, v, 1], scaled by depth.
        uv_ones = np.concatenate((uv, np.ones((uv.shape[0], 1), dtype=np.float32)), axis=1)
        k_inv = np.linalg.inv(np.asarray(camera.K, dtype=np.float32))
        xyz_cam = depth_m * (uv_ones @ k_inv.T)
        # Camera -> world as x_world = R @ x_cam + T.
        # NOTE(review): this treats (R, T) as the camera-to-world transform —
        # confirm the Camera extrinsics convention (often stored world-to-camera).
        rotation = np.asarray(camera.R, dtype=np.float32)
        translation = np.asarray(camera.T, dtype=np.float32).reshape(1, 3)
        xyz_world = (rotation @ xyz_cam.T).T + translation
        pose_world = np.concatenate((xyz_world, scores), axis=1).reshape(
            poses.shape[1], poses.shape[2], 4
        )
        # Zero the whole joint row wherever the score is non-positive.
        pose_world *= (pose_world[:, :, 3:4] > 0.0).astype(np.float32)
        result[view_idx] = pose_world
    return result
+197
View File
@@ -0,0 +1,197 @@
import numpy as np
import rpt
# COCO-style 17 keypoints plus hip_middle, shoulder_middle and head (20 total);
# order must match the rows produced by make_body_2d().
JOINT_NAMES = [
    "nose",
    "eye_left",
    "eye_right",
    "ear_left",
    "ear_right",
    "shoulder_left",
    "shoulder_right",
    "elbow_left",
    "elbow_right",
    "wrist_left",
    "wrist_right",
    "hip_left",
    "hip_right",
    "knee_left",
    "knee_right",
    "ankle_left",
    "ankle_right",
    "hip_middle",
    "shoulder_middle",
    "head",
]
def make_camera(name: str) -> rpt.Camera:
    # Identity-pose pinhole camera: f = 1000 px, zero principal point, no
    # distortion, identity rotation, zero translation, 256x256 image.
    return rpt.make_camera(
        name,
        [[1000, 0, 0], [0, 1000, 0], [0, 0, 1]],
        [0, 0, 0, 0, 0],
        [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        [[0], [0], [0]],
        256,
        256,
        rpt.CameraModel.PINHOLE,
    )
def make_config(num_views: int) -> rpt.TriangulationConfig:
    # Shared test config: identical identity-pose cameras, a 10-unit room
    # bound pair (presumably [max, min] corners — see make_triangulation_config),
    # and the 20-joint layout above.
    return rpt.make_triangulation_config(
        [make_camera(f"Camera {idx}") for idx in range(num_views)],
        np.asarray([[10.0, 10.0, 10.0], [0.0, 0.0, 0.0]], dtype=np.float32),
        JOINT_NAMES,
    )
def make_body_2d() -> np.ndarray:
    # Synthetic upright 2D skeleton in pixel coordinates, one [u, v, score] row
    # per joint in JOINT_NAMES order, all with full confidence.
    return np.asarray(
        [
            [150, 50, 1.0],
            [145, 48, 1.0],
            [155, 48, 1.0],
            [138, 50, 1.0],
            [162, 50, 1.0],
            [135, 80, 1.0],
            [165, 80, 1.0],
            [125, 115, 1.0],
            [175, 115, 1.0],
            [115, 150, 1.0],
            [185, 150, 1.0],
            [145, 130, 1.0],
            [155, 130, 1.0],
            [145, 175, 1.0],
            [155, 175, 1.0],
            [145, 220, 1.0],
            [155, 220, 1.0],
            [150, 130, 1.0],
            [150, 80, 1.0],
            [150, 50, 1.0],
        ],
        dtype=np.float32,
    )
def test_sample_depth_for_poses_respects_person_counts_and_scores():
    # One counted person out of two padded slots; the second joint has zero score.
    poses_2d = np.zeros((1, 2, 2, 3), dtype=np.float32)
    poses_2d[0, 0, 0] = [5, 6, 0.8]
    poses_2d[0, 0, 1] = [7, 8, 0.0]
    person_counts = np.asarray([1], dtype=np.uint32)
    depth_image = np.full((16, 16), 3000, dtype=np.float32)
    # Outlier pixel away from the sampling windows; must not leak into results.
    depth_image[0, 0] = 1234
    poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, [depth_image])
    # Scored joint keeps its u/v and score and picks up the surrounding depth.
    np.testing.assert_allclose(poses_uvd[0, 0, 0], [5.0, 6.0, 3000.0, 0.8], rtol=1e-6, atol=1e-6)
    # Zero-score joint and the uncounted padded person are zeroed entirely.
    np.testing.assert_array_equal(poses_uvd[0, 0, 1], np.zeros((4,), dtype=np.float32))
    np.testing.assert_array_equal(poses_uvd[0, 1], np.zeros((2, 4), dtype=np.float32))
def test_sample_depth_for_poses_uses_symmetric_window():
    # window_size=3 gives a radius-1 cross; the 5000 values placed two or more
    # pixels away must be outside the window, leaving the 1000 center depth.
    poses_2d = np.zeros((1, 1, 1, 3), dtype=np.float32)
    poses_2d[0, 0, 0] = [5, 5, 1.0]
    person_counts = np.asarray([1], dtype=np.uint32)
    depth_image = np.zeros((16, 16), dtype=np.float32)
    depth_image[5, 5] = 1000.0
    depth_image[3, 5] = 5000.0
    depth_image[5, 2] = 5000.0
    depth_image[5, 3] = 5000.0
    depth_image[5, 7] = 5000.0
    depth_image[5, 8] = 5000.0
    poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, [depth_image], window_size=3)
    np.testing.assert_allclose(poses_uvd[0, 0, 0], [5.0, 5.0, 1000.0, 1.0], rtol=1e-6, atol=1e-6)
def test_sample_depth_for_poses_ignores_out_of_bounds_joints():
    # Joint (99, -4) is outside the 16x16 image; the whole row must be zeroed
    # even though the image is full of valid depth.
    poses_2d = np.zeros((1, 1, 1, 3), dtype=np.float32)
    poses_2d[0, 0, 0] = [99, -4, 0.7]
    person_counts = np.asarray([1], dtype=np.uint32)
    poses_uvd = rpt.sample_depth_for_poses(
        poses_2d,
        person_counts,
        [np.full((16, 16), 3000, dtype=np.float32)],
    )
    np.testing.assert_array_equal(poses_uvd[0, 0, 0], np.zeros((4,), dtype=np.float32))
def test_apply_depth_offsets_uses_joint_name_mapping():
    # nose -> +5 mm, shoulder_left -> +30 mm, unknown names -> no offset;
    # the input array must be left unmodified.
    poses_uvd = np.zeros((1, 1, 3, 4), dtype=np.float32)
    poses_uvd[0, 0, :, 2] = 3000.0
    poses_uvd[0, 0, :, 3] = 1.0
    adjusted = rpt.apply_depth_offsets(poses_uvd, ["nose", "shoulder_left", "unknown_joint"])
    np.testing.assert_allclose(adjusted[0, 0, :, 2], [3005.0, 3030.0, 3000.0], rtol=1e-6, atol=1e-6)
    np.testing.assert_allclose(poses_uvd[0, 0, :, 2], [3000.0, 3000.0, 3000.0], rtol=1e-6, atol=1e-6)
def test_lift_depth_poses_to_world_matches_camera_projection():
    # With f=1000 and identity extrinsics: (100, 200) at 3000 mm lifts to
    # (0.1, 0.2, 1) * 3 m = (0.3, 0.6, 3.0); zero-score joints stay zero.
    poses_uvd = np.zeros((1, 1, 2, 4), dtype=np.float32)
    poses_uvd[0, 0, 0] = [100.0, 200.0, 3000.0, 0.9]
    poses_uvd[0, 0, 1] = [0.0, 0.0, 0.0, 0.0]
    lifted = rpt.lift_depth_poses_to_world(poses_uvd, [make_camera("Camera 1")])
    np.testing.assert_allclose(lifted[0, 0, 0], [0.3, 0.6, 3.0, 0.9], rtol=1e-6, atol=1e-6)
    np.testing.assert_array_equal(lifted[0, 0, 1], np.zeros((4,), dtype=np.float32))
def test_merge_rgbd_views_merges_identical_world_poses():
    # Two identical cameras see the same body at uniform depth, so both views
    # propose the same world pose; merging must yield exactly one person.
    config = make_config(2)
    body_2d = make_body_2d()
    poses_2d = np.zeros((2, 1, len(JOINT_NAMES), 3), dtype=np.float32)
    poses_2d[0, 0] = body_2d
    poses_2d[1, 0] = body_2d
    person_counts = np.asarray([1, 1], dtype=np.uint32)
    depth_images = [np.full((256, 256), 3000, dtype=np.float32) for _ in range(2)]
    poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, depth_images)
    poses_uvd = rpt.apply_depth_offsets(poses_uvd, JOINT_NAMES)
    poses_3d_by_view = rpt.lift_depth_poses_to_world(poses_uvd, config.cameras)
    merged = rpt.merge_rgbd_views(poses_3d_by_view, person_counts, config)
    assert merged.shape == (1, len(JOINT_NAMES), 4)
    # All joints except the last ("head") pass through from the proposal.
    np.testing.assert_allclose(merged[0, :-1], poses_3d_by_view[0, 0, :-1], rtol=1e-5, atol=1e-5)
    # The merged head joint is expected at the midpoint of the two ears
    # (joints 3 and 4), carrying the lower of the two ear scores.
    expected_head = (poses_3d_by_view[0, 0, 3] + poses_3d_by_view[0, 0, 4]) * 0.5
    expected_head[3] = min(poses_3d_by_view[0, 0, 3, 3], poses_3d_by_view[0, 0, 4, 3])
    np.testing.assert_allclose(merged[0, -1], expected_head, rtol=1e-5, atol=1e-5)
def test_reconstruct_rgbd_matches_manual_pipeline_and_single_view_person():
    # View 1 sees the body; view 2 is empty (count 0, all-zero depth). The
    # one-call reconstruct_rgbd must match the manually chained pipeline.
    config = make_config(2)
    body_2d = make_body_2d()
    poses_2d = np.zeros((2, 1, len(JOINT_NAMES), 3), dtype=np.float32)
    poses_2d[0, 0] = body_2d
    person_counts = np.asarray([1, 0], dtype=np.uint32)
    depth_images = [
        np.full((256, 256), 3000, dtype=np.float32),
        np.zeros((256, 256), dtype=np.float32),
    ]
    # Manual pipeline: sample -> offsets -> lift -> merge.
    manual = rpt.merge_rgbd_views(
        rpt.lift_depth_poses_to_world(
            rpt.apply_depth_offsets(
                rpt.sample_depth_for_poses(poses_2d, person_counts, depth_images),
                JOINT_NAMES,
            ),
            config.cameras,
        ),
        person_counts,
        config,
    )
    reconstructed = rpt.reconstruct_rgbd(poses_2d, person_counts, depth_images, config)
    assert reconstructed.shape == (1, len(JOINT_NAMES), 4)
    np.testing.assert_allclose(reconstructed, manual, rtol=1e-5, atol=1e-5)
    # The single-view person should still retain a meaningful number of joints.
    assert np.count_nonzero(reconstructed[0, :, 3] > 0.0) >= 7
Generated
+1 -1
View File
@@ -230,7 +230,7 @@ wheels = [
[[package]] [[package]]
name = "rapid-pose-triangulation" name = "rapid-pose-triangulation"
version = "0.1.0" version = "0.2.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }, marker = "python_full_version < '3.11'" },