94 lines
2.1 KiB
Python
94 lines
2.1 KiB
Python
from typing import (
|
|
Any,
|
|
Deque,
|
|
Dict,
|
|
List,
|
|
Literal,
|
|
Optional,
|
|
Sequence,
|
|
Tuple,
|
|
TypedDict,
|
|
Union,
|
|
cast,
|
|
)
|
|
|
|
import numpy as np
|
|
from jaxtyping import Float, Int, Num, Shaped, jaxtyped
|
|
|
|
# `cv2.typing` is treated as optional: when the import fails, `MatLike`
# falls back to the plain NumPy array class so the name stays defined.
try:
    from cv2.typing import MatLike
except ImportError:
    MatLike = np.ndarray
|
|
|
|
# Short alias for the NumPy array class; used as the array type inside the
# jaxtyping shape annotations in this module.
NDArray = np.ndarray

# Names of the accepted bounding-box layouts.
# NOTE(review): presumably "xyxy" means (x1, y1, x2, y2) and "xywh" means
# (x, y, width, height) -- confirm against the code that consumes this alias.
BoundingBoxFormat = Literal["xyxy", "xywh"]
|
|
|
|
|
|
class DetectionResult(TypedDict):
    """
    Object detection output for a single frame.

    In the shape annotations below, ``N`` is the number of detected objects.
    """

    # Single-element array.
    # NOTE(review): presumably holds the number of boxes -- confirm.
    boxes_num: Num[NDArray, "1"]
    # Four values per detected object.
    # NOTE(review): the coordinate layout is not stated here -- confirm.
    boxes: Num[NDArray, "N 4"]
    # One score per detected object.
    scores: Num[NDArray, "N"]
    reference_frame_size: Tuple[int, int]
    """
    Height and width of the reference frame.

    The bounding box coordinates are relative to this frame.

    If the reference frame is resized, the bounding box and keypoint
    coordinates should be scaled accordingly.
    """
|
|
|
|
|
|
# Floating-point keypoint array with 17 two-component points per entry.
# NOTE(review): the name suggests the Human3.6M 17-joint layout, and the
# B / F axes presumably stand for batch and frame -- confirm.
H36KeyPoints = Float[NDArray, "B F 17 2"]
# Same leading axes, but with a variable number (N) of two-component points.
PersonBasedKeypointLike = Float[NDArray, "B F N 2"]
# Four floating-point values per (B, F) entry.
# NOTE(review): the coordinate layout is not stated here -- confirm.
BoundingBoxes = Float[NDArray, "B F 4"]
|
|
|
|
|
|
class KeyPointDetectionResult(TypedDict):
    """
    Keypoints, bounding boxes, and scores detected in a single frame.

    In the shape annotations below, ``N`` is the leading axis shared by the
    per-detection arrays (one entry per detected person).
    """

    # 17 two-component keypoints per detection.
    skeleton_keypoints: Float[NDArray, "N 17 2"]
    # One score per keypoint, kept with a trailing axis of size 1.
    skeleton_keypoints_scores: Float[NDArray, "N 17 1"]
    # Four values per detection.
    # NOTE(review): the coordinate layout is not stated here -- confirm.
    bboxes: Float[NDArray, "N 4"]
    # One score per bounding box; may be None.
    bboxes_scores: Optional[Float[NDArray, "N"]]
    frame_number: int
    """
    The frame number in the video sequence.

    -1 when the frame number is not available.
    """
    # `typing.Tuple` is used (rather than the builtin `tuple[...]`) to match
    # `DetectionResult.reference_frame_size` and to keep the annotation
    # evaluable on interpreters without builtin generics (PEP 585).
    reference_frame_size: Tuple[int, int]
    """
    Height and width of the reference frame.

    The bounding box coordinates and keypoint coordinates are relative to
    this frame.

    If the reference frame is resized, the bounding box and keypoint
    coordinates should be scaled accordingly.
    """
|
|
|
|
|
|
# A sequence of per-frame results: one `KeyPointDetectionResult` per item.
KeyPointDetectionTimeSeries = Sequence[KeyPointDetectionResult]
"""
Each item contains keypoint detection result for all people detected in a single frame.

Intervals between frames are not guaranteed to be consistent
"""
|
|
|
|
|
|
class MixKeypoints(TypedDict):
    """
    Three-component keypoint array bundled with two untyped metadata entries.

    NOTE(review): ``Channels`` and ``Header`` are annotated as ``Any`` and
    their contents are not described in this file -- confirm against the
    code that produces this dictionary.
    """

    # N points with three components each, under any number of leading axes.
    MixThreeDkeypoints: Num[NDArray, "... N 3"]
    Channels: Any
    Header: Any
|
|
|
|
|
|
class ReferenceFrameSize(TypedDict):
    """
    Size of a reference frame, keyed by dimension name.

    Unlike the ``reference_frame_size`` tuples used elsewhere in this module
    (height first, then width), the values here are looked up by key, so
    their order carries no meaning.
    """

    width: int
    height: int
|