"""Shared type aliases and ``TypedDict`` schemas for detection results.

Array shapes are expressed with ``jaxtyping`` annotations over
``numpy.ndarray``.
"""

from typing import (
    Any,
    Deque,
    Dict,
    List,
    Literal,
    Optional,
    Sequence,
    Tuple,
    TypedDict,
    Union,
    cast,
)

import numpy as np
from jaxtyping import Float, Int, Num, Shaped, jaxtyped

try:
    from cv2.typing import MatLike
except ImportError:
    # ``cv2.typing`` could not be imported; fall back to a plain ndarray alias
    # so ``MatLike`` is always defined.
    MatLike = np.ndarray

NDArray = np.ndarray

# Allowed bounding-box layout tags.
# NOTE(review): presumably "xyxy" = two corner points and "xywh" = corner plus
# width/height -- confirm against the code that consumes this alias.
BoundingBoxFormat = Literal["xyxy", "xywh"]


class DetectionResult(TypedDict):
    """
    Detection result per frame

    N is the number of detected objects
    """

    # Single-element array; presumably the number of boxes -- TODO confirm.
    boxes_num: Num[NDArray, "1"]
    # One row of four box coordinates per detected object.
    boxes: Num[NDArray, "N 4"]
    # One score per detected object.
    scores: Num[NDArray, "N"]
    reference_frame_size: Tuple[int, int]
    """
    Height and width of reference frame. The bounding box coordinates are
    relative to this frame.

    If one resizes the reference frame, the bounding box and keypoint
    coordinates should be scaled accordingly.
    """


# NOTE(review): B and F are presumably batch and frame axes, and "H36" likely
# refers to a 17-joint skeleton layout -- confirm with the producers of these
# arrays.
H36KeyPoints = Float[NDArray, "B F 17 2"]
PersonBasedKeypointLike = Float[NDArray, "B F N 2"]
BoundingBoxes = Float[NDArray, "B F 4"]


class KeyPointDetectionResult(TypedDict):
    """
    keypoints, bounding boxes, and scores
    """

    # 17 two-component keypoints for each of the N entries.
    skeleton_keypoints: Float[NDArray, "N 17 2"]
    # One score per keypoint, kept with a trailing axis of size 1.
    skeleton_keypoints_scores: Float[NDArray, "N 17 1"]
    # One row of four box coordinates for each of the N entries.
    bboxes: Float[NDArray, "N 4"]
    # Per-box scores; may be None.
    bboxes_scores: Optional[Float[NDArray, "N"]]
    frame_number: int
    """
    The frame number in the video sequence. -1 when the frame number is not
    available.
    """
    # ``typing.Tuple`` rather than the builtin ``tuple[...]`` generic: this
    # matches ``DetectionResult`` and keeps the class body evaluable on
    # Python 3.8, where subscripting ``tuple`` raises ``TypeError``.
    reference_frame_size: Tuple[int, int]
    """
    Height and Width of the reference frame. The bounding box coordinates and
    keypoint coordinates are relative to this frame.

    If one resizes the reference frame, the bounding box and keypoint
    coordinates should be scaled accordingly.
    """


KeyPointDetectionTimeSeries = Sequence[KeyPointDetectionResult]
"""
Each item contains keypoint detection result for all people detected in a
single frame.

Intervals between frames are not guaranteed to be consistent
"""


class MixKeypoints(TypedDict):
    """3D keypoints bundled with ``Channels`` and ``Header`` payloads.

    NOTE(review): the contents of ``Channels`` and ``Header`` are not
    described anywhere in this module; both are typed as ``Any``.
    """

    # Any number of leading axes, then N points with three components each.
    MixThreeDkeypoints: Num[NDArray, "... N 3"]
    Channels: Any
    Header: Any


class ReferenceFrameSize(TypedDict):
    """Reference frame dimensions addressed by name."""

    width: int
    height: int